1 /**
2 * Internal stuff only, do not import.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.internals;
8 
9 import inteli.types;
10 
// The only math function needed for intel-intrinsics
public import core.math: sqrt; // since it's an intrinsic
13 
14 package:
15 nothrow:
16 @nogc:
17 
18 
19 version(GNU)
20 {
21     version (X86)
22     {
23         // For 32-bit x86, disable vector extensions with GDC. 
24         // It just doesn't work well.
25         enum GDC_with_x86 = true;
26         enum GDC_with_MMX = false;
27         enum GDC_with_SSE = false;
28         enum GDC_with_SSE2 = false;
29         enum GDC_with_SSE3 = false;
30         enum GDC_with_SSSE3 = false;
31         enum GDC_with_SSE41 = false;
32         enum GDC_with_SSE42 = false;
33         enum GDC_with_AVX = false;
34         enum GDC_with_AVX2 = false;
35         enum GDC_with_SHA = false;
36         enum GDC_with_BMI2 = false;
37     }
38     else version (X86_64)
39     {
40         // GDC support uses extended inline assembly:
41         //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
42         //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
43         //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)
44 
45         public import core.simd: byte16, short8, int4, float4, double2;
46 
47         // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
48         // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 
49         public import gcc.builtins;
50 
        // TODO: SSE and SSE2 should eventually be truly optional, if we ever
        // want to support other archs with GDC.
53 
54         enum GDC_with_x86 = true;
55         enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
56         enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
57         enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there
58 
59         static if (__VERSION__ >= 2100) // Starting at GDC 12.1
60         {
61             enum GDC_with_SSE3 = __traits(compiles, __builtin_ia32_haddps);
62             enum GDC_with_SSSE3 = __traits(compiles, __builtin_ia32_pmulhrsw128);
63             enum GDC_with_SSE41 = __traits(compiles, __builtin_ia32_dpps);
64             enum GDC_with_SSE42 = __traits(compiles, __builtin_ia32_pcmpgtq);
65             enum GDC_with_AVX = __traits(compiles, __builtin_ia32_vbroadcastf128_pd256);
66             enum GDC_with_AVX2 = __traits(compiles, __builtin_ia32_gathersiv2df);
67         }
68         else
69         {
            // Before GCC 11.3, there was no reliable way to detect instruction sets.
            // We only enable the detection above starting with GCC 12 (DMDFE 2.100),
            // which is more conservative.
73             enum GDC_with_SSE3 = false;
74             enum GDC_with_SSSE3 = false;
75             enum GDC_with_SSE41 = false;
76             enum GDC_with_SSE42 = false;
77             enum GDC_with_AVX = false;
78             enum GDC_with_AVX2 = false;
79         }
80 
81         enum GDC_with_SHA = false; // TODO: detect that
82         enum GDC_with_BMI2 = false; // TODO: detect that
83     }
84     else
85     {
86         enum GDC_with_x86 = false;
87         enum GDC_with_MMX = false;
88         enum GDC_with_SSE = false;
89         enum GDC_with_SSE2 = false;
90         enum GDC_with_SSE3 = false;
91         enum GDC_with_SSSE3 = false;
92         enum GDC_with_SSE41 = false;
93         enum GDC_with_SSE42 = false;
94         enum GDC_with_AVX = false;
95         enum GDC_with_AVX2 = false;
96         enum GDC_with_SHA = false;
97         enum GDC_with_BMI2 = false;
98     }
99 }
100 else
101 {
102     enum GDC_with_x86 = false;
103     enum GDC_with_MMX = false;
104     enum GDC_with_SSE = false;
105     enum GDC_with_SSE2 = false;
106     enum GDC_with_SSE3 = false;
107     enum GDC_with_SSSE3 = false;
108     enum GDC_with_SSE41 = false;
109     enum GDC_with_SSE42 = false;
110     enum GDC_with_AVX = false;
111     enum GDC_with_AVX2 = false;
112     enum GDC_with_SHA = false;
113     enum GDC_with_BMI2 = false;
114 }
115 
116 version(LDC)
117 {
118     public import core.simd;
119     public import ldc.simd;
120     public import ldc.intrinsics;
121     public import ldc.llvmasm: __asm;
122 
    // Since LDC 1.13, we use the new ldc.llvmasm.__ir variants instead of inlineIR.
124     static if (__VERSION__ >= 2083)
125     {
126          import ldc.llvmasm;
127          alias LDCInlineIR = __ir_pure;
128 
129          // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
130          alias LDCInlineIREx = __irEx_pure; 
131     }
132     else
133     {
134         alias LDCInlineIR = inlineIR;
135     }
136 
137     version(ARM)
138     {
139         public import ldc.gccbuiltins_arm;
140         enum LDC_with_ARM32 = true;
141         enum LDC_with_ARM64 = false;
142         enum LDC_with_ARM64_CRC = false;
143         enum LDC_with_SSE1 = false;
144         enum LDC_with_SSE2 = false;
145         enum LDC_with_SSE3 = false;
146         enum LDC_with_SSSE3 = false;
147         enum LDC_with_SSE41 = false;
148         enum LDC_with_SSE42 = false;
149         enum LDC_with_AVX = false;
150         enum LDC_with_AVX2 = false;
151         enum LDC_with_SHA = false;
152         enum LDC_with_BMI2 = false;
153     }
154     else version(AArch64)
155     {
156         public import ldc.gccbuiltins_aarch64;
157         enum LDC_with_ARM32 = false;
158         enum LDC_with_ARM64 = true; // implies "has Neon"
159         enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc");
160         enum LDC_with_SSE1 = false;
161         enum LDC_with_SSE2 = false;
162         enum LDC_with_SSE3 = false;
163         enum LDC_with_SSSE3 = false;
164         enum LDC_with_SSE41 = false;
165         enum LDC_with_SSE42 = false;
166         enum LDC_with_AVX = false;
167         enum LDC_with_AVX2 = false;
168         enum LDC_with_SHA = false;
169         enum LDC_with_BMI2 = false;
170     }
171     else
172     {
173         public import ldc.gccbuiltins_x86;
174         enum LDC_with_ARM32 = false;
175         enum LDC_with_ARM64 = false;
176         enum LDC_with_ARM64_CRC = false;
177         enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
178         enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
179         enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
180         enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3");
181         enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1");
182         enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");
183         enum LDC_with_AVX = __traits(targetHasFeature, "avx");
184         enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2");
185         enum LDC_with_SHA = __traits(targetHasFeature, "sha");
186         enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2");
187     }
188 }
189 else
190 {
191     enum LDC_with_ARM32 = false;
192     enum LDC_with_ARM64 = false;
193     enum LDC_with_ARM64_CRC = false;
194     enum LDC_with_SSE1 = false;
195     enum LDC_with_SSE2 = false;
196     enum LDC_with_SSE3 = false;
197     enum LDC_with_SSSE3 = false;
198     enum LDC_with_SSE41 = false;
199     enum LDC_with_SSE42 = false;
200     enum LDC_with_AVX = false;
201     enum LDC_with_AVX2 = false;
202     enum LDC_with_SHA = false;
203     enum LDC_with_BMI2 = false;
204 }
205 
206 enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;
207 
208 version(DigitalMars)
209 {
210     version(D_InlineAsm_X86)
211         enum DMD_with_asm = true;
212     else version(D_InlineAsm_X86_64)
213         enum DMD_with_asm = true;
214     else
215         enum DMD_with_asm = false;
216 
217     version(D_InlineAsm_X86)
218         enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
219     else
220         enum DMD_with_32bit_asm = false;
221 
222     version (D_SIMD)
223         enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;
224     else
225         enum DMD_with_DSIMD = false;
226 }
227 else
228 {
229     enum DMD_with_asm = false;
230     enum DMD_with_32bit_asm = false;
231     enum DMD_with_DSIMD = false;
232 }
233 
234 static if (LDC_with_ARM32)
235 {
236     package uint arm_get_fpcr() nothrow @nogc @trusted
237     {
238         return __builtin_arm_get_fpscr();
239     }
240 
241     package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
242     {
243         __builtin_arm_set_fpscr(cw);
244     }
245 }
246 
247 static if (LDC_with_ARM64)
248 {
249     pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
250         long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;
251 
252     package uint arm_get_fpcr() pure nothrow @nogc @trusted
253     {
254         // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
255         return __asm!uint("mrs $0, fpcr", "=r");
256     }
257 
258     package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
259     {
260         // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
261         long save_x2;
262         __asm!void("str x2, $1 \n" ~
263                    "ldr w2, $0 \n" ~
264                    "msr fpcr, x2 \n" ~
265                    "ldr x2, $1 "   , "m,m", cw, &save_x2);
266     }
267 }
268 
269 
// For internal use only, since the public API deals with x86 semantics emulation
271 enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
272 enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
273 enum uint _MM_ROUND_UP_ARM          = 0x00400000;
274 enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
275 enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
276 enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000;
277 
278 
279 //
280 //  <ROUNDING>
281 //
//  Why is this here? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to keep rounding consistent between
//  LDC and DMD. It's important that DMD uses whatever is in MXCSR to round.
//
//  Note: there is no MXCSR on ARM, but fpcr/fpscr implements similar
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversions, albeit slowly.
291 
292 int convertFloatToInt32UsingMXCSR(float value) @trusted
293 {
294     int result;
295     version(GNU)
296     {
297         asm pure nothrow @nogc @trusted
298         {
299             "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
300         }
301     }
302     else static if (LDC_with_ARM32)
303     {
304         // TODO: this is a bug, it won't preserve registers when optimized
305         result = __asm!int(`vldr s2, $1
306                             vcvtr.s32.f32 s2, s2
307                             vmov $0, s2`, "=r,m", value);
308     }
309     else static if (LDC_with_ARM64)
310     {
311         // Get current rounding mode.
312         uint fpscr = arm_get_fpcr();
313 
314         switch(fpscr & _MM_ROUND_MASK_ARM)
315         {
316             default:
317             case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
318             case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
319             case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
320             case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
321         }
322     }
323     else
324     {
325         asm pure nothrow @nogc @trusted
326         {
327             cvtss2si EAX, value;
328             mov result, EAX;
329         }
330     }
331     return result;
332 }
333 
334 int convertDoubleToInt32UsingMXCSR(double value) @trusted
335 {
336     int result;
337     version(GNU)
338     {
339         asm pure nothrow @nogc @trusted
340         {
341             "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
342         }
343     }
344     else static if (LDC_with_ARM32)
345     {
346         // TODO: bug, doesn't preserve registers
347         result = __asm!int(`vldr d2, $1
348                             vcvtr.s32.f64 s2, d2
349                             vmov $0, s2`, "=r,m", value);
350     }
351     else static if (LDC_with_ARM64)
352     {
353         // Get current rounding mode.
354         uint fpscr = arm_get_fpcr();
355 
356         switch(fpscr & _MM_ROUND_MASK_ARM)
357         {
358             default:
359             case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
360             case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
361             case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
362             case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
363         }
364     }
365     else
366     {
367         asm pure nothrow @nogc @trusted
368         {
369             cvtsd2si EAX, value;
370             mov result, EAX;
371         }
372     }
373     return result;
374 }
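
// A minimal sanity check (not exhaustive) for the MXCSR/FPCR-driven 32-bit conversions above.
// Exact integers convert identically under every rounding mode, so these asserts hold
// regardless of the ambient rounding state.
unittest
{
    assert(convertFloatToInt32UsingMXCSR(1.0f) == 1);
    assert(convertFloatToInt32UsingMXCSR(-42.0f) == -42);
    assert(convertDoubleToInt32UsingMXCSR(123456.0) == 123456);
}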
375 
376 long convertFloatToInt64UsingMXCSR(float value) @trusted
377 {
378     static if (LDC_with_ARM32)
379     {
380         // We have to resort to libc since 32-bit ARM 
381         // doesn't seem to have 64-bit registers.
382         
383         uint fpscr = arm_get_fpcr(); // Get current rounding mode.
384 
        // Note: convert to double precision first, else rounding could differ for large integers
386         double asDouble = value; 
387 
388         switch(fpscr & _MM_ROUND_MASK_ARM)
389         {
390             default:
391             case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(asDouble));
392             case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
393             case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
394             case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
395         }
396     }
397     else static if (LDC_with_ARM64)
398     {
399         uint fpscr = arm_get_fpcr();
400 
401         switch(fpscr & _MM_ROUND_MASK_ARM)
402         {
403             default:
404             case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
405             case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
406             case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
407             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
408         }
409     }
410     // 64-bit can use an SSE instruction
411     else version(D_InlineAsm_X86_64)
412     {
413         long result;
414         version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
415         {
416             asm pure nothrow @nogc @trusted
417             {
418                 movss XMM0, value;
419                 cvtss2si RAX, XMM0;
420                 mov result, RAX;
421             }
422         }
423         else
424         {
425             asm pure nothrow @nogc @trusted
426             {
427                 movss XMM0, value;
428                 db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
429                 mov result, RAX;
430             }
431         }
432         return result;
433     }
434     else version(D_InlineAsm_X86)
435     {
436         // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
437         // This leads to an unfortunate FPU sequence in every C++ compiler.
438         // See: https://godbolt.org/z/vZym77
439 
440         // Get current MXCSR rounding
441         uint sseRounding;
442         ushort savedFPUCW;
443         ushort newFPUCW;
444         long result;
445         asm pure nothrow @nogc @trusted
446         {
447             stmxcsr sseRounding;
448             fld value;
449             fnstcw savedFPUCW;
450             mov AX, savedFPUCW;
451             and AX, 0xf3ff;          // clear FPU rounding bits
452             movzx ECX, word ptr sseRounding;
453             and ECX, 0x6000;         // only keep SSE rounding bits
454             shr ECX, 3;
455             or AX, CX;               // make a new control word for FPU with SSE bits
456             mov newFPUCW, AX;
457             fldcw newFPUCW;
458             fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
459             fldcw savedFPUCW;
460         }
461         return result;
462     }
463     else static if (GDC_with_x86)
464     {
465         version(X86_64) // 64-bit can just use the right instruction
466         {
467             static assert(GDC_with_SSE);
468             __m128 A;
469             A.ptr[0] = value;
470             return __builtin_ia32_cvtss2si64 (A);
471         }
472         else version(X86) // 32-bit
473         {
474             // This is untested!
475             uint sseRounding;
476             ushort savedFPUCW;
477             ushort newFPUCW;
478             long result;
479             asm pure nothrow @nogc @trusted
480             {
481                 "stmxcsr %1;\n" ~
482                 "fld %2;\n" ~
483                 "fnstcw %3;\n" ~
484                 "movw %3, %%ax;\n" ~
485                 "andw $0xf3ff, %%ax;\n" ~
486                 "movzwl %1, %%ecx;\n" ~
487                 "andl $0x6000, %%ecx;\n" ~
488                 "shrl $3, %%ecx;\n" ~
489                 "orw %%cx, %%ax\n" ~
490                 "movw %%ax, %4;\n" ~
491                 "fldcw %4;\n" ~
492                 "fistpll %0;\n" ~
493                 "fldcw %3;\n" 
494                   : "=m"(result)    // %0
495                   : "m" (sseRounding),
496                     "f" (value),
497                     "m" (savedFPUCW),
498                     "m" (newFPUCW) 
499                   : "eax", "ecx", "st";
500             }
501             return result;
502         }
503         else
504             static assert(false);
505     }
506     else
507         static assert(false);
508 }
509 
510 
511 ///ditto
512 long convertDoubleToInt64UsingMXCSR(double value) @trusted
513 {
514     static if (LDC_with_ARM32)
515     {
516         // We have to resort to libc since 32-bit ARM 
517         // doesn't seem to have 64-bit registers.
518         uint fpscr = arm_get_fpcr(); // Get current rounding mode.
519         switch(fpscr & _MM_ROUND_MASK_ARM)
520         {
521             default:
522             case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(value));
523             case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
524             case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
525             case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
526         }
527     }
528     else static if (LDC_with_ARM64)
529     {
530         // Get current rounding mode.
531         uint fpscr = arm_get_fpcr();
532 
533         switch(fpscr & _MM_ROUND_MASK_ARM)
534         {
535             default:
536             case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
537             case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
538             case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
539             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
540         }
541     }
542     // 64-bit can use an SSE instruction
543     else version(D_InlineAsm_X86_64)
544     {
545         long result;
546         version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
547         {
548             asm pure nothrow @nogc @trusted
549             {
550                 movsd XMM0, value;
551                 cvtsd2si RAX, XMM0;
552                 mov result, RAX;
553             }
554         }
555         else
556         {
557             asm pure nothrow @nogc @trusted
558             {
559                 movsd XMM0, value;
560                 db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
561                 mov result, RAX;
562             }
563         }
564         return result;
565     }
566     else version(D_InlineAsm_X86)
567     {
568         // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
569         // This leads to an unfortunate FPU sequence in every C++ compiler.
570         // See: https://godbolt.org/z/vZym77
571 
572         // Get current MXCSR rounding
573         uint sseRounding;
574         ushort savedFPUCW;
575         ushort newFPUCW;
576         long result;
577         asm pure nothrow @nogc @trusted
578         {
579             stmxcsr sseRounding;
580             fld value;
581             fnstcw savedFPUCW;
582             mov AX, savedFPUCW;
583             and AX, 0xf3ff;
584             movzx ECX, word ptr sseRounding;
585             and ECX, 0x6000;
586             shr ECX, 3;
587             or AX, CX;
588             mov newFPUCW, AX;
589             fldcw newFPUCW;
590             fistp result;
591             fldcw savedFPUCW;
592         }
593         return result;
594     }
595     else static if (GDC_with_x86)
596     {
597         version(X86_64)
598         {
599             static assert(GDC_with_SSE2);
600             __m128d A;
601             A.ptr[0] = value;
602             return __builtin_ia32_cvtsd2si64 (A);
603         }
604         else
605         {
606             // This is untested!
607             uint sseRounding;
608             ushort savedFPUCW;
609             ushort newFPUCW;
610             long result;
611             asm pure nothrow @nogc @trusted
612             {
613                 "stmxcsr %1;\n" ~
614                 "fld %2;\n" ~
615                 "fnstcw %3;\n" ~
616                 "movw %3, %%ax;\n" ~
617                 "andw $0xf3ff, %%ax;\n" ~
618                 "movzwl %1, %%ecx;\n" ~
619                 "andl $0x6000, %%ecx;\n" ~
620                 "shrl $3, %%ecx;\n" ~
621                 "orw %%cx, %%ax\n" ~
622                 "movw %%ax, %4;\n" ~
623                 "fldcw %4;\n" ~
624                 "fistpll %0;\n" ~
625                 "fldcw %3;\n"         
626                   : "=m"(result)    // %0
627                   : "m" (sseRounding),
628                     "t" (value),
629                     "m" (savedFPUCW),
630                     "m" (newFPUCW) 
631                   : "eax", "ecx", "st";
632             }
633             return result;
634         }
635     }
636     else
637         static assert(false);
638 }
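
// Same minimal check for the 64-bit conversions; again only exact integers,
// which are rounding-mode independent.
unittest
{
    assert(convertFloatToInt64UsingMXCSR(100.0f) == 100);
    assert(convertDoubleToInt64UsingMXCSR(-1234567.0) == -1234567);
}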
639 
640 //
641 //  </ROUNDING>
642 //
643 
644 
// Using Intel terminology here: a "word" is 16 bits, a "dword" is 32 bits.
646 
647 byte saturateSignedWordToSignedByte(short value) pure @safe
648 {
649     if (value > 127) value = 127;
650     if (value < -128) value = -128;
651     return cast(byte) value;
652 }
653 
654 ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
655 {
656     if (value > 255) value = 255;
657     if (value < 0) value = 0;
658     return cast(ubyte) value;
659 }
660 
661 short saturateSignedIntToSignedShort(int value) pure @safe
662 {
663     if (value > 32767) value = 32767;
664     if (value < -32768) value = -32768;
665     return cast(short) value;
666 }
667 
668 ushort saturateSignedIntToUnsignedShort(int value) pure @safe
669 {
670     if (value > 65535) value = 65535;
671     if (value < 0) value = 0;
672     return cast(ushort) value;
673 }
674 
675 unittest // test saturate operations
676 {
677     assert( saturateSignedWordToSignedByte(32000) == 127);
678     assert( saturateSignedWordToUnsignedByte(32000) == 255);
679     assert( saturateSignedWordToSignedByte(-4000) == -128);
680     assert( saturateSignedWordToUnsignedByte(-4000) == 0);
681     assert( saturateSignedIntToSignedShort(32768) == 32767);
682     assert( saturateSignedIntToUnsignedShort(32768) == 32768);
683     assert( saturateSignedIntToSignedShort(-32769) == -32768);
684     assert( saturateSignedIntToUnsignedShort(-32769) == 0);
685 }
686 
687 version(unittest)
688 {
689     // This is just for debugging tests
690     import core.stdc.stdio: printf;
691 
    // Helpers to print vectors while debugging implementations.
    // Note: you can override `pure` within a `debug` clause.
694 
695     void _mm_print_pi64(__m64 v) @trusted
696     {
697         long1 vl = cast(long1)v;
698         printf("%lld\n", vl.array[0]);
699     }
700 
701     void _mm_print_pi32(__m64 v) @trusted
702     {
703         int[2] C = (cast(int2)v).array;
704         printf("%d %d\n", C[0], C[1]);
705     }
706 
707     void _mm_print_pi16(__m64 v) @trusted
708     {
709         short[4] C = (cast(short4)v).array;
710         printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
711     }
712 
713     void _mm_print_pi8(__m64 v) @trusted
714     {
715         byte[8] C = (cast(byte8)v).array;
716         printf("%d %d %d %d %d %d %d %d\n",
717         C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
718     }
719 
720     void _mm_print_epi64(__m128i v) @trusted
721     {
722         long2 vl = cast(long2)v;
723         printf("%lld %lld\n", vl.array[0], vl.array[1]);
724     }
725 
726     void _mm_print_epi32(__m128i v) @trusted
727     {
728         printf("%d %d %d %d\n",
729               v.array[0], v.array[1], v.array[2], v.array[3]);
730     }  
731 
732     void _mm_print_epi16(__m128i v) @trusted
733     {
734         short[8] C = (cast(short8)v).array;
735         printf("%d %d %d %d %d %d %d %d\n",
736         C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
737     }
738 
739     void _mm_print_epi8(__m128i v) @trusted
740     {
741         byte[16] C = (cast(byte16)v).array;
742         printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
743         C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
744     }
745 
746     void _mm_print_ps(__m128 v) @trusted
747     {
        // %g because %f can conceal very small numbers and print zero instead
749         float[4] C = (cast(float4)v).array;
750         printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
751     }
752 
753     void _mm_print_pd(__m128d v) @trusted
754     {
755         double[2] C = (cast(double2)v).array;
756         printf("%f %f\n", C[0], C[1]);
757     }
758 
759     void _mm256_print_pd(__m256d v) @trusted
760     {
        // %g because %f can conceal very small numbers and print zero instead
762         printf("%g %g %g %g\n", v.array[0], v.array[1], v.array[2], v.array[3]); 
763     }
764 
765     void _mm256_print_epi64(__m256i v) @trusted
766     {
767         long4 vl = cast(long4)v;
768         printf("%lld %lld %lld %lld\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3]);
769     }
770 }
771 
772 
773 //
774 //  <FLOATING-POINT COMPARISONS>
775 //
776 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
777 //       need different IR generation.
778 
779 enum FPComparison
780 {
781     oeq,   // ordered and equal
782     ogt,   // ordered and greater than
783     oge,   // ordered and greater than or equal
784     olt,   // ordered and less than
785     ole,   // ordered and less than or equal
786     one,   // ordered and not equal
787     ord,   // ordered (no nans)
788     ueq,   // unordered or equal
789     ugt,   // unordered or greater than ("nle")
790     uge,   // unordered or greater than or equal ("nlt")
791     ult,   // unordered or less than ("nge")
792     ule,   // unordered or less than or equal ("ngt")
793     une,   // unordered or not equal ("neq")
794     uno,   // unordered (either nans)
795 }
796 
797 private static immutable string[FPComparison.max+1] FPComparisonToString =
798 [
799     "oeq",
800     "ogt",
801     "oge",
802     "olt",
803     "ole",
804     "one",
805     "ord",
806     "ueq",
807     "ugt",
808     "uge",
809     "ult",
810     "ule",
811     "une",
812     "uno",
813 ];
814 
// Individual float comparison: returns true if the comparison holds, false otherwise.
// Useful for DMD and for testing.
817 private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
818 {
819     bool unordered = isnan(a) || isnan(b);
820     final switch(comparison) with(FPComparison)
821     {
822         case oeq: return a == b;
823         case ogt: return a > b;
824         case oge: return a >= b;
825         case olt: return a < b;
826         case ole: return a <= b;
827         case one: return !unordered && (a != b); // NaN with != always yields true
828         case ord: return !unordered; 
829         case ueq: return unordered || (a == b);
830         case ugt: return unordered || (a > b);
831         case uge: return unordered || (a >= b);
832         case ult: return unordered || (a < b);
833         case ule: return unordered || (a <= b);
834         case une: return (a != b); // NaN with != always yields true
835         case uno: return unordered;
836     }
837 }
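
// Quick illustration of the ordered vs unordered semantics above: any comparison
// involving a NaN is "unordered", so the o* variants return false and the u*
// variants return true in that case.
unittest
{
    assert( compareFloat!float(FPComparison.oeq, 1.0f, 1.0f));
    assert(!compareFloat!float(FPComparison.oeq, 1.0f, float.nan));
    assert( compareFloat!float(FPComparison.ueq, 1.0f, float.nan));
    assert( compareFloat!float(FPComparison.une, 1.0f, float.nan));
    assert(!compareFloat!float(FPComparison.one, 1.0f, float.nan));
}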
838 
839 version(LDC)
840 {
841     /// Provides packed float comparisons
842     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
843     {
844         enum ir = `
845             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
846             %r = sext <4 x i1> %cmp to <4 x i32>
847             ret <4 x i32> %r`;
848 
849         return LDCInlineIR!(ir, int4, float4, float4)(a, b);
850     }
851 
852     /// Provides packed double comparisons
853     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
854     {
855         enum ir = `
856             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
857             %r = sext <2 x i1> %cmp to <2 x i64>
858             ret <2 x i64> %r`;
859 
860         return LDCInlineIR!(ir, long2, double2, double2)(a, b);
861     }
862 
    /// CMPSS-style comparisons.
    /// clang implements it through x86 intrinsics; it is possible with IR alone,
    /// but that leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and an immediate from 0 to 7.
    /// Not that simple.
868     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
869     {
870         /*
871         enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
872         enum bool invertOp = (predicateNumber & 0x80) != 0;
873         static if(invertOp)
874             return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
875         else
876             return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
877         */
878         enum ir = `
879             %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
880             %r = sext i1 %cmp to i32
881             %r2 = bitcast i32 %r to float
882             ret float %r2`;
883 
884         float4 r = a;
885         r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
886         return r;
887     }
888 
    /// CMPSD-style comparisons.
    /// clang implements it through x86 intrinsics; it is possible with IR alone,
    /// but that leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and an immediate from 0 to 7.
    /// Not that simple.
894     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
895     {
896         enum ir = `
897             %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
898             %r = sext i1 %cmp to i64
899             %r2 = bitcast i64 %r to double
900             ret double %r2`;
901 
902         double2 r = a;
903         r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
904         return r;
905     }
906 }
907 else
908 {
909     /// Provides packed float comparisons
910     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
911     {
912         int4 result;
913         foreach(i; 0..4)
914         {
915             result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
916         }
917         return result;
918     }
919 
920     /// Provides packed double comparisons
921     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
922     {
923         long2 result;
924         foreach(i; 0..2)
925         {
926             result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
927         }
928         return result;
929     }
930 
931     /// Provides CMPSS-style comparison
932     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
933     {
934         int4 result = cast(int4)a;
935         result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
936         return cast(float4)result;
937     }
938 
939     /// Provides CMPSD-style comparison
940     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
941     {
942         long2 result = cast(long2)a;
943         result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
944         return cast(double2)result;
945     }
946 }
947 unittest // cmpps
948 {
    // Check that all comparison types are working
950     float4 A = [1, 3, 5, float.nan];
951     float4 B = [2, 3, 4, 5];
952 
953     int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
954     int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
955     int4 result_oge = cmpps!(FPComparison.oge)(A, B);
956     int4 result_olt = cmpps!(FPComparison.olt)(A, B);
957     int4 result_ole = cmpps!(FPComparison.ole)(A, B);
958     int4 result_one = cmpps!(FPComparison.one)(A, B);
959     int4 result_ord = cmpps!(FPComparison.ord)(A, B);
960     int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
961     int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
962     int4 result_uge = cmpps!(FPComparison.uge)(A, B);
963     int4 result_ult = cmpps!(FPComparison.ult)(A, B);
964     int4 result_ule = cmpps!(FPComparison.ule)(A, B);
965     int4 result_une = cmpps!(FPComparison.une)(A, B);
966     int4 result_uno = cmpps!(FPComparison.uno)(A, B);
967 
968     static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
969     static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
970     static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
971     static immutable int[4] correct_olt    = [-1, 0, 0, 0];
972     static immutable int[4] correct_ole    = [-1,-1, 0, 0];
973     static immutable int[4] correct_one    = [-1, 0,-1, 0];
974     static immutable int[4] correct_ord    = [-1,-1,-1, 0];
975     static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
976     static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
977     static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
978     static immutable int[4] correct_ult    = [-1, 0, 0,-1];
979     static immutable int[4] correct_ule    = [-1,-1, 0,-1];
980     static immutable int[4] correct_une    = [-1, 0,-1,-1];
981     static immutable int[4] correct_uno    = [ 0, 0, 0,-1];
982 
983     assert(result_oeq.array == correct_oeq);
984     assert(result_ogt.array == correct_ogt);
985     assert(result_oge.array == correct_oge);
986     assert(result_olt.array == correct_olt);
987     assert(result_ole.array == correct_ole);
988     assert(result_one.array == correct_one);
989     assert(result_ord.array == correct_ord);
990     assert(result_ueq.array == correct_ueq);
991     assert(result_ugt.array == correct_ugt);
992     assert(result_uge.array == correct_uge);
993     assert(result_ult.array == correct_ult);
994     assert(result_ule.array == correct_ule);
995     assert(result_une.array == correct_une);
996     assert(result_uno.array == correct_uno);
997 }
998 unittest
999 {
1000     double2 a = [1, 3];
1001     double2 b = [2, 3];
1002     long2 c = cmppd!(FPComparison.ult)(a, b);
1003     static immutable long[2] correct = [cast(long)(-1), 0];
1004     assert(c.array == correct);
1005 }
1006 unittest // cmpss
1007 {
1008     void testComparison(FPComparison comparison)(float4 A, float4 B)
1009     {
1010         float4 result = cmpss!comparison(A, B);
1011         int4 iresult = cast(int4)result;
1012         int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
1013         assert(iresult.array[0] == expected);
1014         assert(result.array[1] == A.array[1]);
1015         assert(result.array[2] == A.array[2]);
1016         assert(result.array[3] == A.array[3]);
1017     }
1018 
    // Check that all comparison types are working
1020     float4 A = [1, 3, 5, 6];
1021     float4 B = [2, 3, 4, 5];
1022     float4 C = [float.nan, 3, 4, 5];
1023 
1024     testComparison!(FPComparison.oeq)(A, B);
1025     testComparison!(FPComparison.oeq)(A, C);
1026     testComparison!(FPComparison.ogt)(A, B);
1027     testComparison!(FPComparison.ogt)(A, C);
1028     testComparison!(FPComparison.oge)(A, B);
1029     testComparison!(FPComparison.oge)(A, C);
1030     testComparison!(FPComparison.olt)(A, B);
1031     testComparison!(FPComparison.olt)(A, C);
1032     testComparison!(FPComparison.ole)(A, B);
1033     testComparison!(FPComparison.ole)(A, C);
1034     testComparison!(FPComparison.one)(A, B);
1035     testComparison!(FPComparison.one)(A, C);
1036     testComparison!(FPComparison.ord)(A, B);
1037     testComparison!(FPComparison.ord)(A, C);
1038     testComparison!(FPComparison.ueq)(A, B);
1039     testComparison!(FPComparison.ueq)(A, C);
1040     testComparison!(FPComparison.ugt)(A, B);
1041     testComparison!(FPComparison.ugt)(A, C);
1042     testComparison!(FPComparison.uge)(A, B);
1043     testComparison!(FPComparison.uge)(A, C);
1044     testComparison!(FPComparison.ult)(A, B);
1045     testComparison!(FPComparison.ult)(A, C);
1046     testComparison!(FPComparison.ule)(A, B);
1047     testComparison!(FPComparison.ule)(A, C);
1048     testComparison!(FPComparison.une)(A, B);
1049     testComparison!(FPComparison.une)(A, C);
1050     testComparison!(FPComparison.uno)(A, B);
1051     testComparison!(FPComparison.uno)(A, C);
1052 }
1053 unittest // cmpsd
1054 {
1055     void testComparison(FPComparison comparison)(double2 A, double2 B)
1056     {
1057         double2 result = cmpsd!comparison(A, B);
1058         long2 iresult = cast(long2)result;
1059         long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
1060         assert(iresult.array[0] == expected);
1061         assert(result.array[1] == A.array[1]);
1062     }
1063 
    // Check that all comparison types are working
1065     double2 A = [1, 3];
1066     double2 B = [2, 4];
1067     double2 C = [double.nan, 5];
1068 
1069     testComparison!(FPComparison.oeq)(A, B);
1070     testComparison!(FPComparison.oeq)(A, C);
1071     testComparison!(FPComparison.ogt)(A, B);
1072     testComparison!(FPComparison.ogt)(A, C);
1073     testComparison!(FPComparison.oge)(A, B);
1074     testComparison!(FPComparison.oge)(A, C);
1075     testComparison!(FPComparison.olt)(A, B);
1076     testComparison!(FPComparison.olt)(A, C);
1077     testComparison!(FPComparison.ole)(A, B);
1078     testComparison!(FPComparison.ole)(A, C);
1079     testComparison!(FPComparison.one)(A, B);
1080     testComparison!(FPComparison.one)(A, C);
1081     testComparison!(FPComparison.ord)(A, B);
1082     testComparison!(FPComparison.ord)(A, C);
1083     testComparison!(FPComparison.ueq)(A, B);
1084     testComparison!(FPComparison.ueq)(A, C);
1085     testComparison!(FPComparison.ugt)(A, B);
1086     testComparison!(FPComparison.ugt)(A, C);
1087     testComparison!(FPComparison.uge)(A, B);
1088     testComparison!(FPComparison.uge)(A, C);
1089     testComparison!(FPComparison.ult)(A, B);
1090     testComparison!(FPComparison.ult)(A, C);
1091     testComparison!(FPComparison.ule)(A, B);
1092     testComparison!(FPComparison.ule)(A, C);
1093     testComparison!(FPComparison.une)(A, B);
1094     testComparison!(FPComparison.une)(A, C);
1095     testComparison!(FPComparison.uno)(A, B);
1096     testComparison!(FPComparison.uno)(A, C);
1097 }
1098 
1099 //
1100 //  </FLOATING-POINT COMPARISONS>
1101 //
1102 
1103 
1104 __m64 to_m64(__m128i a) pure @trusted
1105 {
1106     long2 la = cast(long2)a;
1107     long1 r = la.array[0];
1108     return r;
1109 }
1110 
1111 __m128i to_m128i(__m64 a) pure @trusted
1112 {
1113   /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474 
1114     
1115     version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474 
1116     {
1117         long2 r = a.array[0];
1118         r.ptr[1] = 0;
1119         return cast(int4)r;
1120     }
1121     else */
1122     {
1123         long2 r = [0, 0];
1124         r.ptr[0] = a.array[0];
1125         return cast(__m128i)r;
1126     }
1127 }
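
// Round-trip check for the __m64 <-> __m128i helpers above: the low 64-bit lane
// is preserved and to_m128i zeroes the high lane.
unittest
{
    long2 v = [0x01234567_89ABCDEF, 0x7777];
    __m64 lo = to_m64(cast(__m128i)v);
    long2 back = cast(long2) to_m128i(lo);
    assert(back.array[0] == 0x01234567_89ABCDEF);
    assert(back.array[1] == 0);
}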
1128 
1129 // ADDITIONAL x86 INTRINSICS
1130 // Absent from ldc.gccbuiltins_x86 for some reason, but needed.
1131 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td
1132 static if (LDC_with_SSE41)
1133 {
1134     pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
1135         byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe;
1136 }
1137 
1138 // SOME NEON INTRINSICS
1139 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API, but the simde project exposes them all for users to use.
1141 // MAYDO: create a new neon.d module, for internal use only.
1142 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
1143 static if (LDC_with_ARM64)
1144 {
1145     // VERY USEFUL LINK
1146     // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
1147     // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/
1148 
1149     pragma(LDC_intrinsic, "llvm.aarch64.crc32cb")
1150         uint __crc32cb(uint a, uint b) pure @safe;
1151 
1152     pragma(LDC_intrinsic, "llvm.aarch64.crc32ch")
1153         uint __crc32ch(uint a, uint b) pure @safe;
1154 
1155     pragma(LDC_intrinsic, "llvm.aarch64.crc32cw")
1156         uint __crc32cw(uint a, uint b) pure @safe;
1157 
1158     pragma(LDC_intrinsic, "llvm.aarch64.crc32cx")
1159         uint __crc32cd(uint a, ulong b) pure @safe;
1160 
1161     //pragma(LDC_intrinsic, "llvm.aarch64.dmb")
    //    uint __dmb(int a) @safe; // didn't find a name in the intrinsic list
1163 
1164     pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
1165         byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;
1166 
1167     pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
1168         short8 vabsq_s16(short8 a) pure @safe;
1169 
1170     pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
1171         int4 vabsq_s32(int4 a) pure @safe;
1172 
1173     pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
1174         byte16 vabsq_s8(byte16 a) pure @safe;
1175 
1176     byte8 vand_u8(byte8 a, byte8 b) pure @safe
1177     {
1178         return a & b;
1179     }
1180 
1181     long2 vandq_s64(long2 a, long2 b)
1182     {
1183         return a & b;
1184     }
1185 
1186     long2 vbicq_s64(long2 a, long2 b) pure @safe
1187     {
1188         return a & ~b;
1189     }
1190 
    // Bit select: for each bit set in a, take the bit from b, else from c.
    // c ^ ((c ^ b) & a) is the classic branch-free identity for that.
    int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe
1192     {
1193         return c ^ ((c ^ b) & a);
1194     }
1195 
1196     byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe
1197     {
1198         return c ^ ((c ^ b) & a);
1199     }
1200 
1201     long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe
1202     {
1203         return c ^ ((c ^ b) & a);
1204     }
1205 
1206     short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
1207     {
1208         short8 r;
1209         r.ptr[0]  = lo.array[0];
1210         r.ptr[1]  = lo.array[1];
1211         r.ptr[2]  = lo.array[2];
1212         r.ptr[3]  = lo.array[3];
1213         r.ptr[4]  = hi.array[0];
1214         r.ptr[5]  = hi.array[1];
1215         r.ptr[6]  = hi.array[2];
1216         r.ptr[7]  = hi.array[3];
1217         return r;
1218     }
1219 
1220     int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
1221     {
1222         int4 r;
1223         r.ptr[0] = lo.array[0];
1224         r.ptr[1] = lo.array[1];
1225         r.ptr[2] = hi.array[0];
1226         r.ptr[3] = hi.array[1];
1227         return r;
1228     }
1229 
1230     byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
1231     {
1232         byte16 r;
1233         r.ptr[0]  = lo.array[0];
1234         r.ptr[1]  = lo.array[1];
1235         r.ptr[2]  = lo.array[2];
1236         r.ptr[3]  = lo.array[3];
1237         r.ptr[4]  = lo.array[4];
1238         r.ptr[5]  = lo.array[5];
1239         r.ptr[6]  = lo.array[6];
1240         r.ptr[7]  = lo.array[7];
1241         r.ptr[8]  = hi.array[0];
1242         r.ptr[9]  = hi.array[1];
1243         r.ptr[10] = hi.array[2];
1244         r.ptr[11] = hi.array[3];
1245         r.ptr[12] = hi.array[4];
1246         r.ptr[13] = hi.array[5];
1247         r.ptr[14] = hi.array[6];
1248         r.ptr[15] = hi.array[7];
1249         return r;
1250     }
1251 
1252     short8 vcombine_u16(short4 lo, short4 hi) pure @trusted
1253     {
1254         short8 r;
1255         r.ptr[0]  = lo.array[0];
1256         r.ptr[1]  = lo.array[1];
1257         r.ptr[2]  = lo.array[2];
1258         r.ptr[3]  = lo.array[3];
1259         r.ptr[4]  = hi.array[0];
1260         r.ptr[5]  = hi.array[1];
1261         r.ptr[6]  = hi.array[2];
1262         r.ptr[7]  = hi.array[3];
1263         return r;
1264     }
1265 
1266 
1267     // float4 => int4
1268 
1269     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
1270         int4 vcvtmq_s32_f32(float4 a) pure @safe;
1271 
1272     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
1273         int4 vcvtnq_s32_f32(float4 a) pure @safe;
1274 
1275     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
1276         int4 vcvtpq_s32_f32(float4 a) pure @safe;
1277 
1278     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
1279         int4 vcvtzq_s32_f32(float4 a) pure @safe;
1280 
1281 
1282     // double2 => long2
1283 
1284     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
1285         long2 vcvtmq_s64_f64(double2 a) pure @safe;
1286 
1287     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
1288         long2 vcvtnq_s64_f64(double2 a) pure @safe;
1289 
1290     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
1291         long2 vcvtpq_s64_f64(double2 a) pure @safe;
1292 
1293     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
1294         long2 vcvtzq_s64_f64(double2 a) pure @safe;
1295 
1296     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
1297         int vcvtms_s32_f32(float a) pure @safe;
1298 
1299     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
1300         int vcvtns_s32_f32(float a) pure @safe;    
1301 
1302     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
1303         int vcvtps_s32_f32(float a) pure @safe;
1304 
1305     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
1306         int vcvts_s32_f32(float a) pure @safe;
1307      
1308     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
1309         int vcvtms_s32_f64(double a) pure @safe;
1310 
1311     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
1312         int vcvtns_s32_f64(double a) pure @safe;    
1313 
1314     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
1315         int vcvtps_s32_f64(double a) pure @safe;
1316 
1317     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
1318         int vcvts_s32_f64(double a) pure @safe;
1319 
1320     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
1321         long vcvtms_s64_f32(float a) pure @safe;
1322 
1323     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
1324         long vcvtns_s64_f32(float a) pure @safe;    
1325 
1326     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
1327         long vcvtps_s64_f32(float a) pure @safe;
1328 
1329     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
1330         long vcvts_s64_f32(float a) pure @safe;
1331 
1332     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
1333         long vcvtms_s64_f64(double a) pure @safe;
1334 
1335     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
1336         long vcvtns_s64_f64(double a) pure @safe;    
1337 
1338     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
1339         long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64
1340 
1341     pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
1342         long vcvts_s64_f64(double a) pure @safe;
1343 
1344     long2 vdupq_n_s64(long value) pure @safe
1345     {
1346         long2 r;
1347         r = value;
1348         return r;
1349     }
1350 
1351     short4 vget_high_s16(short8 a) pure @trusted
1352     {
1353         short4 r;
1354         r.ptr[0] = a.array[4];
1355         r.ptr[1] = a.array[5];
1356         r.ptr[2] = a.array[6];
1357         r.ptr[3] = a.array[7];
1358         return r;
1359     }
1360 
1361     int2 vget_high_s32(int4 a) pure @trusted
1362     {
1363         int2 r;
1364         r.ptr[0] = a.array[2];
1365         r.ptr[1] = a.array[3];
1366         return r;
1367     }
1368 
1369     byte8 vget_high_u8(byte16 a) pure @trusted
1370     {
1371         byte8 r;
1372         r.ptr[0] = a.array[8];
1373         r.ptr[1] = a.array[9];
1374         r.ptr[2] = a.array[10];
1375         r.ptr[3] = a.array[11];
1376         r.ptr[4] = a.array[12];
1377         r.ptr[5] = a.array[13];
1378         r.ptr[6] = a.array[14];
1379         r.ptr[7] = a.array[15];
1380         return r;
1381     }
1382 
1383     short4 vget_low_s16(short8 a) pure @trusted
1384     {
1385         short4 r;
1386         r.ptr[0] = a.array[0];
1387         r.ptr[1] = a.array[1];
1388         r.ptr[2] = a.array[2];
1389         r.ptr[3] = a.array[3];
1390         return r;
1391     } 
1392 
1393     int2 vget_low_s32(int4 a) pure @trusted
1394     {
1395         int2 r;
1396         r.ptr[0] = a.array[0];
1397         r.ptr[1] = a.array[1];
1398         return r;
1399     }
1400 
1401     byte8 vget_low_u8(byte16 a) pure @trusted
1402     {
1403         byte8 r;
1404         r.ptr[0] = a.array[0];
1405         r.ptr[1] = a.array[1];
1406         r.ptr[2] = a.array[2];
1407         r.ptr[3] = a.array[3];
1408         r.ptr[4] = a.array[4];
1409         r.ptr[5] = a.array[5];
1410         r.ptr[6] = a.array[6];
1411         r.ptr[7] = a.array[7];
1412         return r;
1413     }
1414 
1415     long vgetq_lane_s64(long2 v, const int lane) pure @safe
1416     {
1417         return v.array[lane];
1418     }
1419 
1420     pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
1421         short8 vmaxq_s16(short8 a, short8 b) pure @safe;
1422 
1423     int4 vmaxq_s32(int4 a, int4 b)
1424     {
1425         int4 r;
1426         r[0] = a[0] >= b[0] ? a[0] : b[0];
1427         r[1] = a[1] >= b[1] ? a[1] : b[1];
1428         r[2] = a[2] >= b[2] ? a[2] : b[2];
1429         r[3] = a[3] >= b[3] ? a[3] : b[3];
1430         return r;
1431     }
1432 
1433     pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
1434         short8 vminq_s16(short8 a, short8 b) pure @safe;
1435 
1436     int2 vmovn_s64(long2 a) pure @trusted
1437     {
1438         int2 r;
1439         r.ptr[0] = cast(int)(a.array[0]);
1440         r.ptr[1] = cast(int)(a.array[1]);
1441         return r;
1442     }        
1443 
1444     int4 vmull_s16(short4 a, short4 b) pure @trusted
1445     {
1446         int4 r;
1447         r.ptr[0] = a.array[0] * b.array[0];
1448         r.ptr[1] = a.array[1] * b.array[1];
1449         r.ptr[2] = a.array[2] * b.array[2];
1450         r.ptr[3] = a.array[3] * b.array[3];
1451         return r;
1452     }
1453 
1454     pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64")
1455         long2 vmull_s32(int2 a, int2 b) pure @safe;
1456 
1457     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16")
1458         short4 vpadd_s16(short4 a, short4 b) pure @safe;
1459 
1460     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
1461         int2 vpadd_s32(int2 a, int2 b) pure @safe;
1462 
1463     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
1464         byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;
1465 
1466     pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
1467         short8 vpaddlq_u8 (byte16 a) pure @safe;
1468 
    static if(__VERSION__ >= 2088) // LDC 1.18 started using LLVM 9, which changed the name of the builtin
1470     {
1471         pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
1472             float4 vpaddq_f32(float4 a, float4 b) pure @safe;
1473     }
1474     else
1475     {
1476         pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
1477             float4 vpaddq_f32(float4 a, float4 b) pure @safe;
1478     }
1479     
1480     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16")
1481         short8 vpaddq_s16(short8 a, short8 b) pure @safe;
1482 
1483     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
1484         byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;
1485 
1486     pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32")
1487         int4 vpaddq_s32(int4 a, int4 b) pure @safe;
1488 
1489     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16")
1490         short4 vqadd_s16(short4 a, short4 b) pure @safe;
1491 
1492     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16")
1493         short8 vqaddq_s16(short8 a, short8 b) pure @safe;
1494 
1495     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
1496         byte8 vqmovn_s16(short8 a) pure @safe;
1497 
1498     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
1499         short4 vqmovn_s32(int4 a) pure @safe;
1500 
1501     pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16")
1502         short4 vqmovn_u32(int4 a) pure @safe;
1503 
1504     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
1505         byte8 vqmovun_s16(short8 a) pure @safe;
1506 
1507     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16")
1508         short4 vqsub_s16(short4 a, short4 b) pure @safe;
1509 
1510     pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16")
1511         short8 vqsubq_s16(short8 a, short8 b) pure @safe;
1512 
1513     pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8")
1514         byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe;
1515 
1516     pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
1517         byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;
1518 
1519     pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
1520         short8 vrhadd_u16(short8 a, short8 b) pure @safe;
1521 
1522     pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16")
1523         short4 vrshrn_n_s32(int4 a, int n) pure @safe;        
1524 
1525     byte8 vshr_u8(byte8 a, byte8 b) pure @safe
1526     {
1527         return a >>> b;
1528     }
1529 
1530     byte16 vshrq_n_s8(byte16 a, byte r) pure @safe
1531     { 
1532         a = a >> byte16(cast(byte)r);
1533         return a;
1534     }
1535 
1536     pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8")
1537         byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe;
1538 }
1539 
1540 version(unittest)
1541 {
1542     double abs_double(double x) @trusted
1543     {
1544         version(LDC)
1545             return llvm_fabs(x);
1546         else
1547         {
1548             long uf = *cast(long*)(&x);
1549             uf &= 0x7fffffff_ffffffff;
1550             return *cast(double*)(&uf);
1551         }
1552     }
1553 }
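
// Small check of the sign-bit masking in abs_double above.
unittest
{
    assert(abs_double(-3.5) == 3.5);
    assert(abs_double(2.0) == 2.0);
    assert(abs_double(-0.0) == 0.0);
}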
1554 
// Needed because in the old GDC used on Travis, core.stdc.math.isnan isn't pure.
1556 
1557 bool isnan(float x) pure @trusted
1558 {
1559     uint u = *cast(uint*)(&x);
1560     bool result = ((u & 0x7F800000) == 0x7F800000) && (u & 0x007FFFFF);
1561     return result;
1562 }
1563 unittest
1564 {
1565     float x = float.nan;
1566     assert(isnan(x));
1567 
1568     x = 0;
1569     assert(!isnan(x));
1570     
1571     x = float.infinity;
1572     assert(!isnan(x));
1573 }
1574 
1575 bool isnan(double x) pure @trusted
1576 {
1577     ulong u = *cast(ulong*)(&x);
1578     return ((u & 0x7FF00000_00000000) == 0x7FF00000_00000000) && (u & 0x000FFFFF_FFFFFFFF);
1579 }
1580 unittest
1581 {
1582     double x = double.nan;
1583     assert(isnan(x));
1584 
1585     x = 0;
1586     assert(!isnan(x));
1587     
1588     x = double.infinity;
1589     assert(!isnan(x));
1590 }