1 /**
2 * Internal stuff only, do not import.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.internals;
8 
9 import inteli.types;
10 
11 // The only math functions needed for intel-intrinsics
12 public import core.math: sqrt; // since it's an intrinsics
13 
14 package:
15 nothrow:
16 @nogc:
17 
18 
// Compile-time feature flags for the GDC compiler (version(GNU)).
// Every branch below declares the same set of `GDC_with_*` enums, so the rest of the
// package can query them with `static if` regardless of compiler and target.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC. 
        // It just doesn't work well.
        // Only `GDC_with_x86` is true here; every instruction-set flag is forced to false.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd: byte16, short8, int4, float4, double2;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 
        public import gcc.builtins;

        // TODO: SSE and SSE2 should be truly optional instead, in the future, if we 
        // want to support other archs with GDC

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there

        // With a recent enough front-end, an instruction set is reported as available
        // when one representative GCC builtin for it compiles.
        static if (__VERSION__ >= 2100) // Starting at GDC 12.1
        {
            enum GDC_with_SSE3 = __traits(compiles, __builtin_ia32_haddps);
            enum GDC_with_SSSE3 = __traits(compiles, __builtin_ia32_pmulhrsw128);
            enum GDC_with_SSE41 = __traits(compiles, __builtin_ia32_dpps);
            enum GDC_with_SSE42 = __traits(compiles, __builtin_ia32_pcmpgtq);
            enum GDC_with_AVX = __traits(compiles, __builtin_ia32_vbroadcastf128_pd256);
            enum GDC_with_AVX2 = __traits(compiles, __builtin_ia32_gathersiv2df);
            enum GDC_with_BMI2 = __traits(compiles, __builtin_ia32_pext_si);

        }
        else
        {
            // Before GCC 11.3, no reliable way to detect instruction sets.
            // We start above detection at GCC 12, with DMDFE 2.100, which
            // is more conservative.
            enum GDC_with_SSE3 = false;
            enum GDC_with_SSSE3 = false;
            enum GDC_with_SSE41 = false;
            enum GDC_with_SSE42 = false;
            enum GDC_with_AVX = false;
            enum GDC_with_AVX2 = false;
            enum GDC_with_BMI2 = false;
        }

        enum GDC_with_SHA = false; // TODO: detect that
    }
    else
    {
        // GDC on a target that is neither X86 nor X86_64: nothing is enabled.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
}
else
{
    // Not GDC: all GDC flags are false.
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
    enum GDC_with_SSSE3 = false;
    enum GDC_with_SSE41 = false;
    enum GDC_with_SSE42 = false;
    enum GDC_with_AVX = false;
    enum GDC_with_AVX2 = false;
    enum GDC_with_SHA = false;
    enum GDC_with_BMI2 = false;
}
117 
// Compile-time feature flags for the LDC compiler (version(LDC)).
// Every branch declares the same set of `LDC_with_*` enums so that they can be
// tested with `static if` on any compiler and target.
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
         import ldc.llvmasm;
         alias LDCInlineIR = __ir_pure;

         // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
         alias LDCInlineIREx = __irEx_pure; 
    }
    else
    {
        // Older front-end: only the plain alias exists, `LDCInlineIREx` is not defined.
        alias LDCInlineIR = inlineIR;
    }

    version(ARM)
    {
        // 32-bit ARM: no x86 instruction-set flag is set.
        public import ldc.gccbuiltins_arm;
        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else version(AArch64)
    {
        // 64-bit ARM: CRC availability is queried from the target, x86 flags are all false.
        public import ldc.gccbuiltins_aarch64;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true; // implies "has Neon"
        enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc");
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else
    {
        // Any other LDC target is treated as x86: each flag mirrors the corresponding
        // target feature reported by the compiler.
        public import ldc.gccbuiltins_x86;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
        enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3");
        enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1");
        enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");
        enum LDC_with_AVX = __traits(targetHasFeature, "avx");
        enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2");
        enum LDC_with_SHA = __traits(targetHasFeature, "sha");
        enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2");
    }
}
else
{
    // Not LDC: all LDC flags are false.
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_ARM64_CRC = false;
    enum LDC_with_SSE = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
    enum LDC_with_SSSE3 = false;
    enum LDC_with_SSE41 = false;
    enum LDC_with_SSE42 = false;
    enum LDC_with_AVX = false;
    enum LDC_with_AVX2 = false;
    enum LDC_with_SHA = false;
    enum LDC_with_BMI2 = false;
}

// Set when LDC targets either 32-bit or 64-bit ARM.
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;
209 
// Compile-time feature flags for the DMD compiler (version(DigitalMars)).
// Both the DMD and the non-DMD branch declare the same set of `DMD_with_*` enums.
version(DigitalMars)
{
    // Inline assembler available, in either its 32-bit or its 64-bit flavour.
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;

    version (D_SIMD)
    {
        // D_SIMD only counts when the 128-bit vector types are not emulated
        // (`SSESizedVectorsAreEmulated` comes from inteli.types).
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;

        // Going further, does DMD has SSE4.1 through -mcpu?
        // Detected by checking whether an `int4` multiplication compiles
        // (presumably because packed 32-bit multiply needs SSE4.1 -- TODO confirm).
        static if (DMD_with_DSIMD)
            enum bool DMD_with_DSIMD_and_SSE41 = __traits(compiles, int4(0) * int4(0));
        else
            enum bool DMD_with_DSIMD_and_SSE41 = false;

        // No DMD way to detect those instruction sets => pessimize
        // would be cool to have a way to detect support for this at CT
        enum DMD_with_DSIMD_and_SSE3  = DMD_with_DSIMD_and_SSE41; 
        enum DMD_with_DSIMD_and_SSSE3 = DMD_with_DSIMD_and_SSE41;

        version(D_AVX)
            enum DMD_with_DSIMD_and_AVX   = true;
        else
            enum DMD_with_DSIMD_and_AVX   = false;

        version(D_AVX2)
            enum DMD_with_DSIMD_and_AVX2  = true;
        else
            enum DMD_with_DSIMD_and_AVX2  = false;

        // SSE4.2 has no detection of its own here: it is only reported together with AVX.
        enum DMD_with_DSIMD_and_SSE42 = DMD_with_DSIMD_and_AVX;
    }
    else
    {
        // DMD without D_SIMD: no vector instruction set is reported.
        enum DMD_with_DSIMD = false;
        enum DMD_with_DSIMD_and_SSE3  = false;
        enum DMD_with_DSIMD_and_SSSE3 = false;
        enum DMD_with_DSIMD_and_SSE41 = false;
        enum DMD_with_DSIMD_and_SSE42 = false;
        enum DMD_with_DSIMD_and_AVX   = false;
        enum DMD_with_DSIMD_and_AVX2  = false;
    }
}
else
{
    // Not DMD: all DMD flags are false.
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
    enum DMD_with_DSIMD_and_SSE3  = false;
    enum DMD_with_DSIMD_and_SSSE3 = false;
    enum DMD_with_DSIMD_and_SSE41 = false;
    enum DMD_with_DSIMD_and_SSE42 = false;
    enum DMD_with_DSIMD_and_AVX   = false;
    enum DMD_with_DSIMD_and_AVX2  = false;
}
274 
275 
// Combined flags: true when either GDC or LDC reports the instruction set.
// Sometimes it can be helpful to share builtin code between the two compilers; however,
// keep in mind that LDC and GDC builtins often differ subtly with respect to unsigned vs
// signed vectors, return types, purity... Test it in Godbolt! This is safer with float
// and double intrinsics.
enum GDC_or_LDC_with_SSE  = GDC_with_SSE  || LDC_with_SSE;
enum GDC_or_LDC_with_SSE2 = GDC_with_SSE2 || LDC_with_SSE2;
enum GDC_or_LDC_with_SSE3 = GDC_with_SSE3 || LDC_with_SSE3;

enum GDC_or_LDC_with_AVX  = GDC_with_AVX  || LDC_with_AVX;
enum GDC_or_LDC_with_AVX2 = GDC_with_AVX2 || LDC_with_AVX2;
enum GDC_or_LDC_with_SHA  = GDC_with_SHA  || LDC_with_SHA;
enum GDC_or_LDC_with_BMI2 = GDC_with_BMI2 || LDC_with_BMI2;
287 
288 
// 32-bit ARM with LDC: accessors for the floating-point control register.
// They carry the same names (`arm_get_fpcr` / `arm_set_fpcr`) as the AArch64
// versions, so callers can use one spelling on both targets.
static if (LDC_with_ARM32)
{
    /// Returns the value produced by the `__builtin_arm_get_fpscr` builtin.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    /// Passes the control word `cw` to the `__builtin_arm_set_fpscr` builtin.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}
301 
// AArch64 with LDC: accessors for the FPCR register, implemented with inline asm.
static if (LDC_with_ARM64)
{
    // Declaration of the LLVM intrinsic. It is not called by `arm_get_fpcr` below,
    // which reads the register with an `mrs` instruction instead.
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Reads FPCR with `mrs` and returns it as a `uint`.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes the control word `cw` into FPCR.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // The sequence uses x2 as scratch: it is stored to `save_x2` first, loaded
        // with `cw`, moved to FPCR, and finally reloaded from `save_x2`.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}
323 
324 
// For internal use only, since public API deals with a x86 semantic emulation.
// These values are compared against `arm_get_fpcr() & _MM_ROUND_MASK_ARM` to find the
// active rounding mode on ARM. The rounding field occupies bits 22-23 (mask 0x00C00000).
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
// Single-bit mask (bit 24); going by its name, the flush-to-zero control bit.
enum uint _MM_FLUSH_ZERO_MASK_ARM = 0x01000000;
332 
333 
334 //
335 //  <ROUNDING>
336 //
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help keep rounding consistent between 
//  LDC and DMD. It's important that DMD uses whatever is in MXCSR to round.
//
//  Note: There is no MXCSR on ARM, but there is FPCR/FPSCR, which implements similar 
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use FPCR/FPSCR since it's thread-local, so we can emulate those x86 conversions, albeit slowly.
346 
/// Converts a `float` to a 32-bit signed integer using the current rounding mode
/// (MXCSR on x86; the rounding bits read through `arm_get_fpcr` on AArch64).
///
/// Params:
///     value = the number to convert.
/// Returns: the converted integer.
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        // GDC: extended asm, `value` bound to an SSE register, result to a general register.
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // Load into s2, convert in place with `vcvtr`, move the integer out.
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // One conversion helper per rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
        }
    }
    else
    {
        // DMD-style inline asm (also used by LDC on x86).
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
387 
/// Converts a `double` to a 32-bit signed integer using the current rounding mode
/// (MXCSR on x86; the rounding bits read through `arm_get_fpcr` on AArch64).
///
/// Params:
///     value = the number to convert.
/// Returns: the converted integer.
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        // GDC: extended asm, `value` bound to an SSE register, result to a general register.
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // Load into d2, convert into s2 with `vcvtr`, move the integer out.
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // One conversion helper per rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
        }
    }
    else
    {
        // DMD-style inline asm (also used by LDC on x86).
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
428 
/// Converts a `float` to a 64-bit signed integer using the current rounding mode
/// (MXCSR on x86; the rounding bits read through `arm_get_fpcr` on ARM with LDC).
///
/// Params:
///     value = the number to convert.
/// Returns: the converted integer.
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value; 

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must break ties to even, as x86 `cvtss2si` does.
            // `llvm_round` breaks ties away from zero (2.5 -> 3), so `llvm_rint` is used:
            // it follows the current rounding mode, which is nearest-even in this branch.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        // One conversion helper per rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // move them from bits 13-14 to bits 10-11
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;        // restore the original FPU control word
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same FPU sequence as the D_InlineAsm_X86 branch, in GCC extended-asm syntax.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n" 
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}
562 
563 
/// Converts a `double` to a 64-bit signed integer using the current rounding mode
/// (MXCSR on x86; the rounding bits read through `arm_get_fpcr` on ARM with LDC).
///
/// Params:
///     value = the number to convert.
/// Returns: the converted integer.
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must break ties to even, as x86 `cvtsd2si` does.
            // `llvm_round` breaks ties away from zero (2.5 -> 3), so `llvm_rint` is used:
            // it follows the current rounding mode, which is nearest-even in this branch.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // One conversion helper per rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // move them from bits 13-14 to bits 10-11
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;            // convert using the temporary control word
            fldcw savedFPUCW;        // restore the original FPU control word
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // Same FPU sequence as the D_InlineAsm_X86 branch, in GCC extended-asm syntax.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"         
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}
692 
693 //
694 //  </ROUNDING>
695 //
696 
697 
698 // using the Intel terminology here
699 
/// Clamps a signed 16-bit value to the signed 8-bit range [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value < byte.min)
        return byte.min;
    if (value > byte.max)
        return byte.max;
    return cast(byte) value;
}
706 
/// Clamps a signed 16-bit value to the unsigned 8-bit range [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    const int clamped = value < 0 ? 0 : (value > ubyte.max ? ubyte.max : value);
    return cast(ubyte) clamped;
}
713 
/// Clamps a signed 32-bit value to the signed 16-bit range [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value >= short.max)
        return short.max;
    if (value <= short.min)
        return short.min;
    return cast(short) value;
}
720 
/// Clamps a signed 32-bit value to the unsigned 16-bit range [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    enum int upper = ushort.max;
    const int bounded = value > upper ? upper : value;
    return cast(ushort) (bounded < 0 ? 0 : bounded);
}
727 
unittest // test saturate operations
{
    // 16-bit -> signed 8-bit
    assert(127  == saturateSignedWordToSignedByte(32000));
    assert(-128 == saturateSignedWordToSignedByte(-4000));

    // 16-bit -> unsigned 8-bit
    assert(255 == saturateSignedWordToUnsignedByte(32000));
    assert(0   == saturateSignedWordToUnsignedByte(-4000));

    // 32-bit -> signed 16-bit
    assert(32767  == saturateSignedIntToSignedShort(32768));
    assert(-32768 == saturateSignedIntToSignedShort(-32769));

    // 32-bit -> unsigned 16-bit
    assert(32768 == saturateSignedIntToUnsignedShort(32768));
    assert(0     == saturateSignedIntToUnsignedShort(-32769));
}
739 
version(unittest)
{
    // Debugging aids for the tests: each helper prints the lanes of one vector,
    // separated by spaces, followed by a newline.
    import core.stdc.stdio: printf;

    // Note: you can override `pure` within a `debug` clause

    /// Prints the single 64-bit lane of `v`.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long[1] lanes = (cast(long1)v).array;
        printf("%lld\n", lanes[0]);
    }

    /// Prints `v` as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] lanes = (cast(int2)v).array;
        printf("%d %d\n", lanes[0], lanes[1]);
    }

    /// Prints `v` as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] lanes = (cast(short4)v).array;
        printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints `v` as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] lanes = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints `v` as two 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long[2] lanes = (cast(long2)v).array;
        printf("%lld %lld\n", lanes[0], lanes[1]);
    }

    /// Prints `v` as four 32-bit integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
               v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints `v` as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] lanes = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints `v` as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] lanes = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2],  lanes[3],  lanes[4],  lanes[5],  lanes[6],  lanes[7],
               lanes[8], lanes[9], lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15]);
    }

    /// Prints `v` as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        float[4] lanes = (cast(float4)v).array;
        printf("%g %g %g %g\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints `v` as two doubles (fixed notation, `%f`).
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] lanes = (cast(double2)v).array;
        printf("%f %f\n", lanes[0], lanes[1]);
    }

    /// Prints `v` as four doubles.
    void _mm256_print_pd(__m256d v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        printf("%g %g %g %g\n",
               v.array[0], v.array[1], v.array[2], v.array[3]);
    }

    /// Prints `v` as eight floats.
    void _mm256_print_ps(__m256 v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        printf("%g %g %g %g %g %g %g %g\n",
               v.array[0], v.array[1], v.array[2], v.array[3],
               v.array[4], v.array[5], v.array[6], v.array[7]);
    }

    /// Prints `v` as eight 32-bit integers.
    void _mm256_print_epi32(__m256i v) @trusted
    {
        int[8] lanes = (cast(int8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints `v` as four 64-bit integers.
    void _mm256_print_epi64(__m256i v) @trusted
    {
        long[4] lanes = (cast(long4)v).array;
        printf("%lld %lld %lld %lld\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }
}
839 
840 
841 //
842 //  <FLOATING-POINT COMPARISONS>
843 //
844 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
845 //       need different IR generation.
846 
/// Floating-point comparison predicates.
/// "Ordered" predicates are false as soon as one operand is NaN; "unordered"
/// predicates are true in that case. The member order must stay in sync with
/// `FPComparisonToString`, which is indexed by these values.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}
864 
// Text of each predicate, indexed by `FPComparison` value. The entries are spliced
// after the `fcmp` keyword in the inline-IR strings built by the LDC comparison
// helpers, so each one must be spelled exactly like its enum member.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];
882 
// Scalar floating-point comparison: evaluates the predicate `comparison` on `a` and `b`
// and returns the outcome as a `bool`.
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    const bool eitherIsNaN = isnan(a) || isnan(b);

    final switch (comparison)
    {
        // Ordered predicates: the built-in operators already yield false on NaN,
        // except `!=`, which has to be masked explicitly.
        case FPComparison.oeq: return a == b;
        case FPComparison.ogt: return a > b;
        case FPComparison.oge: return a >= b;
        case FPComparison.olt: return a < b;
        case FPComparison.ole: return a <= b;
        case FPComparison.one: return (a != b) && !eitherIsNaN;
        case FPComparison.ord: return !eitherIsNaN;

        // Unordered predicates: a NaN operand makes them true.
        case FPComparison.ueq: return eitherIsNaN || (a == b);
        case FPComparison.ugt: return eitherIsNaN || (a > b);
        case FPComparison.uge: return eitherIsNaN || (a >= b);
        case FPComparison.ult: return eitherIsNaN || (a < b);
        case FPComparison.ule: return eitherIsNaN || (a <= b);
        case FPComparison.une: return a != b; // `!=` is already true when a NaN is involved
        case FPComparison.uno: return eitherIsNaN;
    }
}
906 
907 version(LDC)
908 {
    /// Packed float comparison (LDC path).
    /// Assembles, at compile time, an LLVM IR snippet that performs a 4-lane `fcmp`
    /// with the predicate named by `comparison` and sign-extends every i1 result to
    /// i32. Each lane of the returned vector is therefore -1 (all bits set) where the
    /// predicate holds and 0 where it does not.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        // The predicate name is spliced in from FPComparisonToString.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }
919 
    /// Packed double comparison (LDC path).
    /// Same scheme as the packed float version, on two double lanes: a 2-lane `fcmp`
    /// with the predicate named by `comparison`, sign-extended to i64. Each lane of
    /// the result is -1 (all bits set) where the predicate holds and 0 where it does not.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        // The predicate name is spliced in from FPComparisonToString.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }
930 
    /// CMPSS-style comparison (LDC path): only lane 0 is compared.
    /// Returns `a` with lane 0 replaced by the comparison mask (all bits set when the
    /// predicate holds for a[0] and b[0], all bits clear otherwise); lanes 1..3 are
    /// copied from `a` unchanged.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7.
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        // Disabled draft of the builtin-based implementation.
        // NOTE(review): it calls __builtin_ia32_cmpsd although the PERF note above
        // names __builtin_ia32_cmpss - looks like a copy-paste from the double
        // version; confirm before reviving it.
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        // Scalar fcmp, sign-extended to i32 (-1 or 0), then reinterpreted as a float
        // so the bit pattern can be stored in a float lane.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }
956 
    /// CMPSD-style comparison (LDC path): only lane 0 is compared.
    /// Returns `a` with lane 0 replaced by the comparison mask (all bits set when the
    /// predicate holds for a[0] and b[0], all bits clear otherwise); lane 1 is copied
    /// from `a` unchanged.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7.
    /// Not that simple.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        // Scalar fcmp, sign-extended to i64 (-1 or 0), then reinterpreted as a double
        // so the bit pattern can be stored in a double lane.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
974 }
975 else
976 {
977     /// Provides packed float comparisons
978     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
979     {
980         int4 result;
981         foreach(i; 0..4)
982         {
983             result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
984         }
985         return result;
986     }
987 
988     /// Provides packed double comparisons
989     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
990     {
991         long2 result;
992         foreach(i; 0..2)
993         {
994             result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
995         }
996         return result;
997     }
998 
999     /// Provides CMPSS-style comparison
1000     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
1001     {
1002         int4 result = cast(int4)a;
1003         result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
1004         return cast(float4)result;
1005     }
1006 
1007     /// Provides CMPSD-style comparison
1008     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
1009     {
1010         long2 result = cast(long2)a;
1011         result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
1012         return cast(double2)result;
1013     }
1014 }
unittest // cmpps
{
    // Check that every comparison type works.
    // The four lanes cover: a < b, a == b, a > b, and a NaN operand (unordered).
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    // Expected masks: -1 where the predicate holds, 0 where it does not.
    // The last column is the NaN lane: 0 for every ordered predicate,
    // -1 for every unordered one.
    static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt    = [-1, 0, 0, 0];
    static immutable int[4] correct_ole    = [-1,-1, 0, 0];
    static immutable int[4] correct_one    = [-1, 0,-1, 0];
    static immutable int[4] correct_ord    = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult    = [-1, 0, 0,-1];
    static immutable int[4] correct_ule    = [-1,-1, 0,-1];
    static immutable int[4] correct_une    = [-1, 0,-1,-1];
    static immutable int[4] correct_uno    = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest // cmppd
{
    // Only the `ult` predicate is exercised here, with no NaN operand:
    // lane 0 is 1 < 2 (true => -1), lane 1 is 3 < 3 (false => 0).
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss
{
    // Checks one predicate: lane 0 must match the scalar reference compareFloat,
    // and lanes 1..3 must be passed through from the first operand.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);
    }

    // Check that every comparison type works, once against an ordinary
    // operand (B) and once against an operand whose lane 0 is NaN (C).
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd
{
    // Checks one predicate: lane 0 must match the scalar reference compareFloat,
    // and lane 1 must be passed through from the first operand.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
    }

    // Check that every comparison type works, once against an ordinary
    // operand (B) and once against an operand whose lane 0 is NaN (C).
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
1166 
1167 //
1168 //  </FLOATING-POINT COMPARISONS>
1169 //
1170 
1171 
/// Narrows `a` to 64 bits: returns its low 64-bit lane, the high lane is dropped.
__m64 to_m64(__m128i a) pure @trusted
{
    long2 halves = cast(long2)a;
    long1 low = halves.array[0];
    return low;
}
1178 
/// Widens `a` to 128 bits: the low 64-bit lane receives `a`, the high lane is zero.
__m128i to_m128i(__m64 a) pure @trusted
{
    // A DigitalMars-only variant was tried as a workaround for
    // https://issues.dlang.org/show_bug.cgi?id=21474 but was not sufficient to
    // avoid it:
    //
    //     long2 r = a.array[0];
    //     r.ptr[1] = 0;
    //     return cast(int4)r;
    //
    // so every compiler takes the path below.
    long2 wide = [0, 0];
    wide.ptr[0] = a.array[0];
    return cast(__m128i)wide;
}
1196 
1197 // ADDITIONAL x86 INTRINSICS
1198 // Absent from ldc.gccbuiltins_x86 for some reason, but needed.
1199 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td
static if (LDC_with_SSE41)
{
    // Declares `__builtin_ia32_pblendvb` as a binding to the LLVM intrinsic
    // `llvm.x86.sse41.pblendvb`; only available when LDC_with_SSE41 is true.
    pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
        byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe;
}
1205 
1206 // SOME NEON INTRINSICS
1207 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// Not in the public API, but the simde project exposes it all for the user to use.
1209 // MAYDO: create a new neon.d module, for internal use only.
1210 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
1211 static if (LDC_with_ARM64)
1212 {
1213     // VERY USEFUL LINK
1214     // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
1215     // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/
1216 
    // Bindings to LLVM's `llvm.aarch64.crc32c*` intrinsics; the suffix letter of the
    // LLVM name (b, h, w, x) differs per declaration.
    // NOTE(review): presumably `a` is the running CRC and `b` the new data, with the
    // suffix giving the data width - confirm against the ARM intrinsics reference.
    pragma(LDC_intrinsic, "llvm.aarch64.crc32cb")
        uint __crc32cb(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32ch")
        uint __crc32ch(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32cw")
        uint __crc32cw(uint a, uint b) pure @safe;

    // The only variant with a 64-bit second operand. The D name ends in `cd`
    // while the LLVM intrinsic it binds to ends in `cx`.
    pragma(LDC_intrinsic, "llvm.aarch64.crc32cx")
        uint __crc32cd(uint a, ulong b) pure @safe;
1228 
1229     //pragma(LDC_intrinsic, "llvm.aarch64.dmb")
    //    uint __dmb(int a) @safe; // didn't find a name in the intrinsic list
1231 
    // Bindings to LLVM NEON intrinsics: `uabd` on 16 x i8, then `abs` on
    // 8 x i16, 4 x i32 and 16 x i8. Each D declaration maps to the intrinsic
    // named in the pragma right above it.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
        byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
        short8 vabsq_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
        int4 vabsq_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
        byte16 vabsq_s8(byte16 a) pure @safe;
1243 
1244     byte8 vand_u8(byte8 a, byte8 b) pure @safe
1245     {
1246         return a & b;
1247     }
1248 
1249     long2 vandq_s64(long2 a, long2 b)
1250     {
1251         return a & b;
1252     }
1253 
1254     long2 vbicq_s64(long2 a, long2 b) pure @safe
1255     {
1256         return a & ~b;
1257     }
1258 
1259     int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe
1260     {
1261         return c ^ ((c ^ b) & a);
1262     }
1263 
1264     byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe
1265     {
1266         return c ^ ((c ^ b) & a);
1267     }
1268 
1269     long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe
1270     {
1271         return c ^ ((c ^ b) & a);
1272     }
1273 
1274     short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
1275     {
1276         short8 r;
1277         r.ptr[0]  = lo.array[0];
1278         r.ptr[1]  = lo.array[1];
1279         r.ptr[2]  = lo.array[2];
1280         r.ptr[3]  = lo.array[3];
1281         r.ptr[4]  = hi.array[0];
1282         r.ptr[5]  = hi.array[1];
1283         r.ptr[6]  = hi.array[2];
1284         r.ptr[7]  = hi.array[3];
1285         return r;
1286     }
1287 
1288     int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
1289     {
1290         int4 r;
1291         r.ptr[0] = lo.array[0];
1292         r.ptr[1] = lo.array[1];
1293         r.ptr[2] = hi.array[0];
1294         r.ptr[3] = hi.array[1];
1295         return r;
1296     }
1297 
1298     byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
1299     {
1300         byte16 r;
1301         r.ptr[0]  = lo.array[0];
1302         r.ptr[1]  = lo.array[1];
1303         r.ptr[2]  = lo.array[2];
1304         r.ptr[3]  = lo.array[3];
1305         r.ptr[4]  = lo.array[4];
1306         r.ptr[5]  = lo.array[5];
1307         r.ptr[6]  = lo.array[6];
1308         r.ptr[7]  = lo.array[7];
1309         r.ptr[8]  = hi.array[0];
1310         r.ptr[9]  = hi.array[1];
1311         r.ptr[10] = hi.array[2];
1312         r.ptr[11] = hi.array[3];
1313         r.ptr[12] = hi.array[4];
1314         r.ptr[13] = hi.array[5];
1315         r.ptr[14] = hi.array[6];
1316         r.ptr[15] = hi.array[7];
1317         return r;
1318     }
1319 
1320     short8 vcombine_u16(short4 lo, short4 hi) pure @trusted
1321     {
1322         short8 r;
1323         r.ptr[0]  = lo.array[0];
1324         r.ptr[1]  = lo.array[1];
1325         r.ptr[2]  = lo.array[2];
1326         r.ptr[3]  = lo.array[3];
1327         r.ptr[4]  = hi.array[0];
1328         r.ptr[5]  = hi.array[1];
1329         r.ptr[6]  = hi.array[2];
1330         r.ptr[7]  = hi.array[3];
1331         return r;
1332     }
1333 
1334 
    // float4 => int4
    // Each declaration below binds to the LLVM intrinsic named in its pragma. The
    // letter after "fcvt" (m, n, p, z) selects the conversion variant.
    // NOTE(review): presumably m/n/p/z mean round toward -infinity / to nearest /
    // toward +infinity / toward zero, as for the ARM FCVT* instructions - confirm
    // against the ARM reference.

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
        int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
        int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
        int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
        int4 vcvtzq_s32_f32(float4 a) pure @safe;


    // double2 => long2

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
        long2 vcvtmq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
        long2 vcvtnq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
        long2 vcvtpq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
        long2 vcvtzq_s64_f64(double2 a) pure @safe;

    // Scalar conversions. Note that the `vcvts_*` declarations bind to the
    // `fcvtzs` intrinsics.

    // float => int
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
        int vcvtms_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
        int vcvtns_s32_f32(float a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
        int vcvtps_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
        int vcvts_s32_f32(float a) pure @safe;
     
    // double => int
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
        int vcvtms_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
        int vcvtns_s32_f64(double a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
        int vcvtps_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
        int vcvts_s32_f64(double a) pure @safe;

    // float => long
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
        long vcvtms_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
        long vcvtns_s64_f32(float a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
        long vcvtps_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
        long vcvts_s64_f32(float a) pure @safe;

    // double => long
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
        long vcvtms_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
        long vcvtns_s64_f64(double a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
        long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
        long vcvts_s64_f64(double a) pure @safe;
1411 
1412     long2 vdupq_n_s64(long value) pure @safe
1413     {
1414         long2 r;
1415         r = value;
1416         return r;
1417     }
1418 
1419     short4 vget_high_s16(short8 a) pure @trusted
1420     {
1421         short4 r;
1422         r.ptr[0] = a.array[4];
1423         r.ptr[1] = a.array[5];
1424         r.ptr[2] = a.array[6];
1425         r.ptr[3] = a.array[7];
1426         return r;
1427     }
1428 
1429     int2 vget_high_s32(int4 a) pure @trusted
1430     {
1431         int2 r;
1432         r.ptr[0] = a.array[2];
1433         r.ptr[1] = a.array[3];
1434         return r;
1435     }
1436 
1437     byte8 vget_high_u8(byte16 a) pure @trusted
1438     {
1439         byte8 r;
1440         r.ptr[0] = a.array[8];
1441         r.ptr[1] = a.array[9];
1442         r.ptr[2] = a.array[10];
1443         r.ptr[3] = a.array[11];
1444         r.ptr[4] = a.array[12];
1445         r.ptr[5] = a.array[13];
1446         r.ptr[6] = a.array[14];
1447         r.ptr[7] = a.array[15];
1448         return r;
1449     }
1450 
1451     short4 vget_low_s16(short8 a) pure @trusted
1452     {
1453         short4 r;
1454         r.ptr[0] = a.array[0];
1455         r.ptr[1] = a.array[1];
1456         r.ptr[2] = a.array[2];
1457         r.ptr[3] = a.array[3];
1458         return r;
1459     } 
1460 
1461     int2 vget_low_s32(int4 a) pure @trusted
1462     {
1463         int2 r;
1464         r.ptr[0] = a.array[0];
1465         r.ptr[1] = a.array[1];
1466         return r;
1467     }
1468 
1469     byte8 vget_low_u8(byte16 a) pure @trusted
1470     {
1471         byte8 r;
1472         r.ptr[0] = a.array[0];
1473         r.ptr[1] = a.array[1];
1474         r.ptr[2] = a.array[2];
1475         r.ptr[3] = a.array[3];
1476         r.ptr[4] = a.array[4];
1477         r.ptr[5] = a.array[5];
1478         r.ptr[6] = a.array[6];
1479         r.ptr[7] = a.array[7];
1480         return r;
1481     }
1482 
1483     long vgetq_lane_s64(long2 v, const int lane) pure @safe
1484     {
1485         return v.array[lane];
1486     }
1487 
    // Binding to the LLVM intrinsic `llvm.aarch64.neon.smax.v8i16`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
        short8 vmaxq_s16(short8 a, short8 b) pure @safe;
1490 
1491     int4 vmaxq_s32(int4 a, int4 b)
1492     {
1493         int4 r;
1494         r[0] = a[0] >= b[0] ? a[0] : b[0];
1495         r[1] = a[1] >= b[1] ? a[1] : b[1];
1496         r[2] = a[2] >= b[2] ? a[2] : b[2];
1497         r[3] = a[3] >= b[3] ? a[3] : b[3];
1498         return r;
1499     }
1500 
    // Binding to the LLVM intrinsic `llvm.aarch64.neon.smin.v8i16`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
        short8 vminq_s16(short8 a, short8 b) pure @safe;
1503 
1504     int2 vmovn_s64(long2 a) pure @trusted
1505     {
1506         int2 r;
1507         r.ptr[0] = cast(int)(a.array[0]);
1508         r.ptr[1] = cast(int)(a.array[1]);
1509         return r;
1510     }        
1511 
1512     int4 vmull_s16(short4 a, short4 b) pure @trusted
1513     {
1514         int4 r;
1515         r.ptr[0] = a.array[0] * b.array[0];
1516         r.ptr[1] = a.array[1] * b.array[1];
1517         r.ptr[2] = a.array[2] * b.array[2];
1518         r.ptr[3] = a.array[3] * b.array[3];
1519         return r;
1520     }
1521 
    // Bindings to LLVM NEON intrinsics: `smull` (int2 x int2 => long2), `addp` on
    // 64-bit vectors (4 x i16, 2 x i32, 8 x i8) and `uaddlp` (16 x i8 => 8 x i16).
    // Each D declaration maps to the intrinsic named in the pragma right above it.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64")
        long2 vmull_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16")
        short4 vpadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
        int2 vpadd_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
        byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
        short8 vpaddlq_u8 (byte16 a) pure @safe;
1536 
    // LDC 1.18 started using LLVM 9, which changed the name of this builtin:
    // from front-end version 2088 on, bind to `faddp`, otherwise to `addp`.
    static if(__VERSION__ >= 2088)
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
1547     
    // Bindings to LLVM NEON intrinsics. Each D declaration maps to the intrinsic
    // named in the pragma right above it.

    // `addp` on 128-bit vectors.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16")
        short8 vpaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
        byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32")
        int4 vpaddq_s32(int4 a, int4 b) pure @safe;

    // `sqadd`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16")
        short4 vqadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16")
        short8 vqaddq_s16(short8 a, short8 b) pure @safe;

    // `sqxtn` / `uqxtn` / `sqxtun`: the result has the same lane count as the
    // argument, with lanes of half the width.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
        byte8 vqmovn_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
        short4 vqmovn_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16")
        short4 vqmovn_u32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
        byte8 vqmovun_s16(short8 a) pure @safe;

    // `sqsub`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16")
        short4 vqsub_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16")
        short8 vqsubq_s16(short8 a, short8 b) pure @safe;

    // `tbl1` with a 16-byte index vector.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8")
        byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe;

    // `urhadd`.
    // NOTE(review): both bindings take 128-bit vectors although their names lack
    // the `q` that the other 128-bit bindings here carry - confirm the naming.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
        byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
        short8 vrhadd_u16(short8 a, short8 b) pure @safe;

    // `rshrn`: int4 => short4, with the shift amount passed as `n`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16")
        short4 vrshrn_n_s32(int4 a, int n) pure @safe;        
1592 
1593     byte8 vshr_u8(byte8 a, byte8 b) pure @safe
1594     {
1595         return a >>> b;
1596     }
1597 
1598     byte16 vshrq_n_s8(byte16 a, byte r) pure @safe
1599     { 
1600         a = a >> byte16(cast(byte)r);
1601         return a;
1602     }
1603 
    // Binding to the LLVM intrinsic `llvm.aarch64.neon.tbl1.v8i8`: a 16-byte table
    // `t` and an 8-byte index vector `idx`, with an 8-byte result.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8")
        byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe;
1606 }
1607 
1608 version(unittest)
1609 {
1610     double abs_double(double x) @trusted
1611     {
1612         version(LDC)
1613             return llvm_fabs(x);
1614         else
1615         {
1616             long uf = *cast(long*)(&x);
1617             uf &= 0x7fffffff_ffffffff;
1618             return *cast(double*)(&uf);
1619         }
1620     }
1621 }
1622 
1623 // needed because in old GDC from travis, core.stdc.math.isnan isn't pure
1624 
/// Returns `true` if `x` is a NaN, by inspecting its bit pattern:
/// all exponent bits set and a non-zero mantissa.
bool isnan(float x) pure @trusted
{
    immutable uint bits = *cast(uint*)(&x);
    immutable bool exponentAllOnes = (bits & 0x7F800000) == 0x7F800000;
    immutable bool mantissaNonZero = (bits & 0x007FFFFF) != 0;
    return exponentAllOnes && mantissaNonZero;
}
unittest // isnan(float)
{
    // A NaN is detected...
    float notANumber = float.nan;
    assert(isnan(notANumber));

    // ...while zero and infinity are not NaNs.
    float zero = 0;
    assert(!isnan(zero));

    float inf = float.infinity;
    assert(!isnan(inf));
}
1642 
/// Returns `true` if `x` is a NaN, by inspecting its bit pattern:
/// all exponent bits set and a non-zero mantissa.
bool isnan(double x) pure @trusted
{
    immutable ulong bits = *cast(ulong*)(&x);
    immutable bool exponentAllOnes = (bits & 0x7FF00000_00000000) == 0x7FF00000_00000000;
    immutable bool mantissaNonZero = (bits & 0x000FFFFF_FFFFFFFF) != 0;
    return exponentAllOnes && mantissaNonZero;
}
unittest // isnan(double)
{
    // A NaN is detected...
    double notANumber = double.nan;
    assert(isnan(notANumber));

    // ...while zero and infinity are not NaNs.
    double zero = 0;
    assert(!isnan(zero));

    double inf = double.infinity;
    assert(!isnan(inf));
}