1 /**
2 * Internal stuff only, do not import.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2025, Stefanos Baziotis 2019.
5 *            cet 2024.
6 *            Copyright Kitsunebi Games 2025.
7 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8 */
9 module inteli.internals;
10 
11 import inteli.types;
12 
13 package:
14 nothrow:
15 @nogc:
16 
17 // nurt compatibility
18 
// When built against the numem "nurt" minimal runtime, route allocation and
// memcpy through the numem hooks; otherwise fall back to the C standard library.
// Either way this module exposes the same names: malloc, free, memcpy,
// onOutOfMemoryError.
version(USE_NURT) 
{
    import numem.core.hooks : nu_malloc, nu_free, nu_memcpy;
    public import core.internal.exception : onOutOfMemoryError;
    alias malloc = nu_malloc;
    alias free = nu_free;
    alias memcpy = nu_memcpy;
} 
else 
{
    public import core.stdc.stdlib: malloc, free;
    public import core.stdc.string: memcpy;
    public import core.exception: onOutOfMemoryError;
}
33 
34 // The only math functions needed for intel-intrinsics
35 public import core.math: sqrt;
36 public import core.bitop: bsf, bsr;
37 
38 
39 
/// Helps portability with yet unsupported platforms.
/// Instantiating this template emits a compile-time warning (via pragma(msg))
/// that the calling intrinsic `fname` is unimplemented on this target;
/// at runtime it does nothing.
void __warn_noop(string fname = __FUNCTION__)() 
{
    pragma(msg, "Warning: ", fname, " is currently not supported, it will become a NO-OP!");
}
/// Same as __warn_noop, for intrinsics with a non-void return type:
/// emits the compile-time warning and returns `rval` (defaults to RetT.init).
RetT __warn_noop_ret(RetT, string fname = __FUNCTION__)(RetT rval = RetT.init) 
    if (!is(RetT == void)) 
{
    pragma(msg, "Warning: ", fname, " is currently not supported, it will become a NO-OP!");
    return rval;
}
52 
53 
54 
// Per-target capability flags for GDC.
// GDC_with_XXX == true means instruction set XXX is usable through GCC builtins.
version(GNU)
{
    version (X86)
    {
        // For 32-bit x86, disable vector extensions with GDC. 
        // It just doesn't work well.
        enum GDC_with_x86 = true;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_F16C = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
    else version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd: byte16, short8, int4, float4, double2;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html 
        public import gcc.builtins;

        // TODO: SSE and SSE2 should be truly optional instead, in the future, if we 
        // want to support other archs with GDC

        enum GDC_with_x86 = true;
        enum GDC_with_MMX = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE = true; // We don't have a way to detect that at CT, but we assume it's there
        enum GDC_with_SSE2 = true; // We don't have a way to detect that at CT, but we assume it's there

        // Starting at GDC 12.1, detect capabilities with __traits(compiles).
        // Before GCC 11.3, no reliable way to detect instruction sets.
        // We start above detection at GCC 12, with DMDFE 2.100, which
        // is more conservative.
        static if (__VERSION__ >= 2100) 
        {
            // Each probe tests for a builtin that only exists when the
            // corresponding -m<feature> flag was passed to the compiler.
            enum GDC_with_SSE3 = __traits(compiles, __builtin_ia32_haddps);
            enum GDC_with_SSSE3 = __traits(compiles, __builtin_ia32_pmulhrsw128);
            enum GDC_with_SSE41 = __traits(compiles, __builtin_ia32_dpps);
            enum GDC_with_SSE42 = __traits(compiles, __builtin_ia32_pcmpgtq);
            enum GDC_with_AVX = __traits(compiles, __builtin_ia32_vbroadcastf128_pd256);
            enum GDC_with_F16C = __traits(compiles, __builtin_ia32_vcvtps2ph);
            enum GDC_with_AVX2 = __traits(compiles, __builtin_ia32_gathersiv2df);
            enum GDC_with_SHA = __traits(compiles, __builtin_ia32_sha256msg1);
            enum GDC_with_BMI2 = __traits(compiles, __builtin_ia32_pext_si);

        }
        else
        {
            // Older front-end: pessimize, assume nothing beyond SSE2.
            enum GDC_with_SSE3 = false;
            enum GDC_with_SSSE3 = false;
            enum GDC_with_SSE41 = false;
            enum GDC_with_SSE42 = false;
            enum GDC_with_AVX = false;
            enum GDC_with_F16C = false;
            enum GDC_with_AVX2 = false;
            enum GDC_with_SHA = false;
            enum GDC_with_BMI2 = false;
        }

        
    }
    else
    {
        // Non-x86 GDC target: no x86 instruction sets available.
        enum GDC_with_x86 = false;
        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_F16C = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
}
else
{
    // Not GDC at all: every GDC_with_XXX flag is false.
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
    enum GDC_with_SSSE3 = false;
    enum GDC_with_SSE41 = false;
    enum GDC_with_SSE42 = false;
    enum GDC_with_AVX = false;
    enum GDC_with_F16C = false;
    enum GDC_with_AVX2 = false;
    enum GDC_with_SHA = false;
    enum GDC_with_BMI2 = false;
}
162 
// Per-target capability flags for LDC.
// LDC_with_XXX == true means instruction set XXX is usable through LDC
// builtins (ldc.gccbuiltins_*) on the current target.
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    version (X86)
        private enum bool some_x86 = true;
    else version (X86_64)
        private enum bool some_x86 = true;
    else
        private enum bool some_x86 = false;

    // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
    static if (__VERSION__ >= 2083)
    {
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure; 

        enum bool LDC_with_InlineIREx = true;
    }
    else
    {
        alias LDCInlineIR = inlineIR;
        enum bool LDC_with_InlineIREx = false;
    }

    // This is used to disable LDC feature that are expensive at compile time: 
    // everything that relies on inline LLVM IR.
    version(D_Optimized)
    {
        enum bool LDC_with_optimizations = true;
    }
    else
    {
        static if (__VERSION__ < 2101)
        {
            // See Issue #136, D_Optimized only appeared in DMDFE 2.101.
            // Relying on this had terrible consequences.
            enum bool LDC_with_optimizations = true;
        }
        else
            enum bool LDC_with_optimizations = false;
    }

    version(ARM)
    {
        // 32-bit ARM: only ARM builtins, no x86 instruction sets.
        public import ldc.gccbuiltins_arm;

        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_CRC32 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_F16C = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else version(AArch64)
    {
        public import ldc.gccbuiltins_aarch64;
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true; // implies "has Neon"
        enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc");
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_CRC32 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_F16C = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else static if (some_x86)
    {
        public import ldc.gccbuiltins_x86;

        // Workaround LDC 1.32.0 having NO builtins at all.
        // See LDC issue 4347 https://github.com/ldc-developers/ldc/issues/4347
        enum LDC_with_ia32_builtins = __traits(compiles, __builtin_ia32_clflush); // This one must be available in all of LDC history.

        static if (!LDC_with_ia32_builtins)
        {
            // in case our __builtin_ia32_clflush workaround breaks
            pragma(msg, "Warning: LDC v1.32.0 has no SIMD builtins. intel-intrinsics will use slow path. Please avoid LDC 1.32.0");
        }

        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = __traits(targetHasFeature, "sse") && LDC_with_ia32_builtins;
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2") && LDC_with_ia32_builtins;
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3") && LDC_with_ia32_builtins;
        enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3") && LDC_with_ia32_builtins;
        enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1") && LDC_with_ia32_builtins;
        enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2") && LDC_with_ia32_builtins;

        // Since LDC 1.30, crc32 is a separate (and sufficient) attribute from sse4.2
        // As of Jan 2023, GDC doesn't make that distinction, -msse4.2 includes -mcrc32 for GDC.
        static if (__VERSION__ >= 2100)
        {
            enum LDC_with_CRC32 = __traits(targetHasFeature, "crc32") && LDC_with_ia32_builtins;
        }
        else
        {
            enum LDC_with_CRC32 = __traits(targetHasFeature, "sse4.2") && LDC_with_ia32_builtins; // crc32 used to be included in sse4.2
        }

        enum LDC_with_AVX = __traits(targetHasFeature, "avx") && LDC_with_ia32_builtins;
        enum LDC_with_F16C = __traits(targetHasFeature, "f16c") && LDC_with_ia32_builtins;
        enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2") && LDC_with_ia32_builtins;
        enum LDC_with_SHA = __traits(targetHasFeature, "sha") && LDC_with_ia32_builtins;
        enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2") && LDC_with_ia32_builtins;
    }
    else
    {
        // Other LDC target (e.g. WebAssembly, RISC-V): nothing available.
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;
        enum LDC_with_SSE = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_CRC32 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_F16C = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }

    // Should we use inline x86 assembly with DMD syntax, in LDC?
    version(D_InlineAsm_X86)
    {
        enum LDC_with_32b_x86_asm = LDC_with_SSE2; // if no SSE support, disable the x86 asm code path
        enum LDC_with_64b_x86_asm = false;
    }
    else version(D_InlineAsm_X86_64)
    {
        enum LDC_with_32b_x86_asm = false;
        enum LDC_with_64b_x86_asm = LDC_with_SSE2;
    }
    else
    {
        enum LDC_with_32b_x86_asm = false;
        enum LDC_with_64b_x86_asm = false;
    }
}
else
{
    // Not LDC at all: every LDC_with_XXX flag is false.
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_ARM64_CRC = false;
    enum LDC_with_SSE = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
    enum LDC_with_SSSE3 = false;
    enum LDC_with_SSE41 = false;
    enum LDC_with_SSE42 = false;
    enum LDC_with_CRC32 = false;
    enum LDC_with_AVX = false;
    enum LDC_with_F16C = false;
    enum LDC_with_AVX2 = false;
    enum LDC_with_SHA = false;
    enum LDC_with_BMI2 = false;

    enum LDC_with_InlineIREx = false;
    enum bool LDC_with_optimizations = false;
    enum bool LDC_with_32b_x86_asm = false;
    enum bool LDC_with_64b_x86_asm = false;
}
// True when DMD-syntax x86 inline assembly is usable under LDC (either bitness).
enum LDC_with_x86_asm = LDC_with_32b_x86_asm || LDC_with_64b_x86_asm;


// True when targeting any ARM (32-bit or 64-bit) with LDC.
enum LDC_with_ARM = LDC_with_ARM32 | LDC_with_ARM64;
355 
// Per-target capability flags for DMD.
// DMD_with_asm: DMD-syntax inline assembly is available.
// DMD_with_DSIMD*: D_SIMD core.simd vectors are real (not emulated) and
// which instruction sets -mcpu made available.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
        enum DMD_with_asm = true;
    else version(D_InlineAsm_X86_64)
        enum DMD_with_asm = true;
    else
        enum DMD_with_asm = false;

    version(D_InlineAsm_X86)
        enum DMD_with_32bit_asm = DMD_with_asm; // sometimes you want a 32-bit DMD only solution
    else
        enum DMD_with_32bit_asm = false;

    version (D_SIMD)
    {
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;

        // Going further, does DMD has SSE4.1 through -mcpu?
        static if (DMD_with_DSIMD)
            enum bool DMD_with_DSIMD_and_SSE41 = __traits(compiles, int4(0) * int4(0));
        else
            enum bool DMD_with_DSIMD_and_SSE41 = false;

        // No DMD way to detect those instruction sets => pessimize
        // would be cool to have a way to detect support for this at CT
        enum DMD_with_DSIMD_and_SSE3  = DMD_with_DSIMD_and_SSE41; 
        enum DMD_with_DSIMD_and_SSSE3 = DMD_with_DSIMD_and_SSE41;

        version(D_AVX)
            enum DMD_with_DSIMD_and_AVX   = true;
        else
            enum DMD_with_DSIMD_and_AVX   = false;

        version(D_AVX2)
            enum DMD_with_DSIMD_and_AVX2  = true;
        else
            enum DMD_with_DSIMD_and_AVX2  = false;

        // No dedicated version identifier for SSE4.2; AVX implies it.
        enum DMD_with_DSIMD_and_SSE42 = DMD_with_DSIMD_and_AVX;
    }
    else
    {
        enum DMD_with_DSIMD = false;
        enum DMD_with_DSIMD_and_SSE3  = false;
        enum DMD_with_DSIMD_and_SSSE3 = false;
        enum DMD_with_DSIMD_and_SSE41 = false;
        enum DMD_with_DSIMD_and_SSE42 = false;
        enum DMD_with_DSIMD_and_AVX   = false;
        enum DMD_with_DSIMD_and_AVX2  = false;
    }
}
else
{
    // Not DMD at all: every DMD_with_XXX flag is false.
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
    enum DMD_with_DSIMD_and_SSE3  = false;
    enum DMD_with_DSIMD_and_SSSE3 = false;
    enum DMD_with_DSIMD_and_SSE41 = false;
    enum DMD_with_DSIMD_and_SSE42 = false;
    enum DMD_with_DSIMD_and_AVX   = false;
    enum DMD_with_DSIMD_and_AVX2  = false;
}
420 
421 
// Sometimes, can be helpful to merge builtin code, however keep in mind that
// LDC and GDC builtins often subtly diverge, wrt. unsigned vs signed vectors, 
// return types, purity... test it in Godbolt! this is safer with float and double intrinsics.
enum GDC_or_LDC_with_SSE  = GDC_with_SSE  || LDC_with_SSE;
enum GDC_or_LDC_with_SSE2 = GDC_with_SSE2 || LDC_with_SSE2;
enum GDC_or_LDC_with_SSE3 = GDC_with_SSE3 || LDC_with_SSE3;
enum GDC_or_LDC_with_SSE41 = GDC_with_SSE41 || LDC_with_SSE41;
enum GDC_or_LDC_with_SSE42 = GDC_with_SSE42 || LDC_with_SSE42;

enum GDC_or_LDC_with_AVX  = GDC_with_AVX  || LDC_with_AVX;
enum GDC_or_LDC_with_F16C = GDC_with_F16C || LDC_with_F16C;
enum GDC_or_LDC_with_AVX2 = GDC_with_AVX2 || LDC_with_AVX2;
enum GDC_or_LDC_with_SHA  = GDC_with_SHA  || LDC_with_SHA;
enum GDC_or_LDC_with_BMI2 = GDC_with_BMI2 || LDC_with_BMI2;

// DMDFE 2.102 allows comparison operators on builtin __vector types;
// these flags are true when the given vector width is real (not emulated).
static if (__VERSION__ >= 2102)
{
    enum SIMD_COMPARISON_MASKS_8B  = !MMXSizedVectorsAreEmulated; // can do < <= => > == with builtin 8 bytes __vectors.
    enum SIMD_COMPARISON_MASKS_16B = !SSESizedVectorsAreEmulated; // can do < <= => > == with builtin 16 bytes __vectors.
    enum SIMD_COMPARISON_MASKS_32B = !AVXSizedVectorsAreEmulated; // can do < <= => > == with builtin 32 bytes __vectors.
}
else
{
    enum SIMD_COMPARISON_MASKS_8B = false;
    enum SIMD_COMPARISON_MASKS_16B = false;
    enum SIMD_COMPARISON_MASKS_32B = false;
}
449 
450 
static if (LDC_with_ARM32)
{
    // Read the 32-bit ARM FPSCR (floating-point status and control register),
    // which holds the rounding-mode bits used by the ROUNDING helpers below.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        return __builtin_arm_get_fpscr();
    }

    // Write the 32-bit ARM FPSCR with control word `cw`.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}
463 
static if (LDC_with_ARM64)
{
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    // Read the AArch64 FPCR via a raw `mrs` instruction.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    // Write the AArch64 FPCR with control word `cw`, manually saving and
    // restoring x2 around the `msr` (fpcr must be written from a register).
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}
485 
486 
// For internal use only, since public API deals with a x86 semantic emulation.
// These are the FPCR/FPSCR RMode rounding-mode bit patterns (bits 22-23) and
// the flush-to-zero bit, named after their x86 MXCSR counterparts.
enum uint _MM_ROUND_NEAREST_ARM     = 0x00000000;
enum uint _MM_ROUND_DOWN_ARM        = 0x00800000;
enum uint _MM_ROUND_UP_ARM          = 0x00400000;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 0x00C00000;
enum uint _MM_ROUND_MASK_ARM        = 0x00C00000;
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 0x01000000;
494 
495 
496 //
497 //  <ROUNDING>
498 //
499 //  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
500 //  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help achieve consistent rounding between 
502 //  LDC and DMD. It's important that DMD uses whatever is in MXCSR to round.
503 //
504 //  Note: There is no MXCSR in ARM. But there is fpcr/fpscr that implements similar 
505 //  functionality.
506 //  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversions, albeit slowly.
508 
/// Converts `value` to a 32-bit integer honouring the current hardware
/// rounding mode: MXCSR on x86 (via `cvtss2si`), FPCR/FPSCR on ARM.
/// On non-x86 GDC targets this degrades to plain truncation (see BUG below).
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
            }
        }
        else version(X86_64)
        {
            asm pure nothrow @nogc @trusted
            {
                "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
            }
        }
        else
        {
            // BUG: this is truncation instead of MXCSR
            result = cast(int)value;
        }
    }
    else static if (LDC_with_ARM32)
    {
        // vcvtr rounds according to the current FPSCR mode.
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m,~{s2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the AArch64 convert instruction matching that mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
        }
    }
    else
    {
        // DMD-syntax inline asm path (DMD and LDC on x86).
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
564 
/// Converts `value` to a 32-bit integer honouring the current hardware
/// rounding mode: MXCSR on x86 (via `cvtsd2si`), FPCR/FPSCR on ARM.
/// On non-x86 GDC targets this degrades to plain truncation (see BUG below).
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
            }
        }
        else version(X86_64)
        {
            asm pure nothrow @nogc @trusted
            {
                "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
            }
        }
        else
        {
            // BUG: this is truncation instead of MXCSR
            result = cast(int)value;
        }
    }
    else static if (LDC_with_ARM32)
    {
        // vcvtr rounds according to the current FPSCR mode.
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m,~{s2},~{d2}", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the AArch64 convert instruction matching that mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
        }
    }
    else
    {
        // DMD-syntax inline asm path (DMD and LDC on x86).
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
620 
/// Converts `value` to a 64-bit integer honouring the current hardware
/// rounding mode: MXCSR on x86-64 (via `cvtss2si`), FPCR/FPSCR on ARM,
/// and an x87 control-word dance on 32-bit x86 (no SSE2 64-bit convert there).
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value; 

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // move MXCSR RC bits (13-14) to FPU CW RC position (10-11)
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n" 
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
    {
        // BUG
        // This is a last resort and wrong, typically
        // for GDC architectures we don't yet support
        return cast(long)value;
    }
}
759 
760 
761 ///ditto
762 long convertDoubleToInt64UsingMXCSR(double value) @trusted
763 {
764     static if (LDC_with_ARM32)
765     {
766         // We have to resort to libc since 32-bit ARM 
767         // doesn't seem to have 64-bit registers.
768         uint fpscr = arm_get_fpcr(); // Get current rounding mode.
769         switch(fpscr & _MM_ROUND_MASK_ARM)
770         {
771             default:
772             case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_round(value));
773             case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
774             case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
775             case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
776         }
777     }
778     else static if (LDC_with_ARM64)
779     {
780         // Get current rounding mode.
781         uint fpscr = arm_get_fpcr();
782 
783         switch(fpscr & _MM_ROUND_MASK_ARM)
784         {
785             default:
786             case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
787             case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
788             case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
789             case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
790         }
791     }
792     // 64-bit can use an SSE instruction
793     else version(D_InlineAsm_X86_64)
794     {
795         long result;
796         version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
797         {
798             asm pure nothrow @nogc @trusted
799             {
800                 movsd XMM0, value;
801                 cvtsd2si RAX, XMM0;
802                 mov result, RAX;
803             }
804         }
805         else
806         {
807             asm pure nothrow @nogc @trusted
808             {
809                 movsd XMM0, value;
810                 db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
811                 mov result, RAX;
812             }
813         }
814         return result;
815     }
816     else version(D_InlineAsm_X86)
817     {
818         // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
819         // This leads to an unfortunate FPU sequence in every C++ compiler.
820         // See: https://godbolt.org/z/vZym77
821 
822         // Get current MXCSR rounding
823         uint sseRounding;
824         ushort savedFPUCW;
825         ushort newFPUCW;
826         long result;
827         asm pure nothrow @nogc @trusted
828         {
829             stmxcsr sseRounding;
830             fld value;
831             fnstcw savedFPUCW;
832             mov AX, savedFPUCW;
833             and AX, 0xf3ff;
834             movzx ECX, word ptr sseRounding;
835             and ECX, 0x6000;
836             shr ECX, 3;
837             or AX, CX;
838             mov newFPUCW, AX;
839             fldcw newFPUCW;
840             fistp result;
841             fldcw savedFPUCW;
842         }
843         return result;
844     }
845     else static if (GDC_with_x86)
846     {
847         version(X86_64)
848         {
849             static assert(GDC_with_SSE2);
850             __m128d A;
851             A.ptr[0] = value;
852             return __builtin_ia32_cvtsd2si64 (A);
853         }
854         else
855         {
856             // This is untested!
857             uint sseRounding;
858             ushort savedFPUCW;
859             ushort newFPUCW;
860             long result;
861             asm pure nothrow @nogc @trusted
862             {
863                 "stmxcsr %1;\n" ~
864                 "fld %2;\n" ~
865                 "fnstcw %3;\n" ~
866                 "movw %3, %%ax;\n" ~
867                 "andw $0xf3ff, %%ax;\n" ~
868                 "movzwl %1, %%ecx;\n" ~
869                 "andl $0x6000, %%ecx;\n" ~
870                 "shrl $3, %%ecx;\n" ~
871                 "orw %%cx, %%ax\n" ~
872                 "movw %%ax, %4;\n" ~
873                 "fldcw %4;\n" ~
874                 "fistpll %0;\n" ~
875                 "fldcw %3;\n"         
876                   : "=m"(result)    // %0
877                   : "m" (sseRounding),
878                     "t" (value),
879                     "m" (savedFPUCW),
880                     "m" (newFPUCW) 
881                   : "eax", "ecx", "st";
882             }
883             return result;
884         }
885     }
886     else
887     {
888         // BUG
        // This is a last resort and wrong, typically
890         // for GDC architectures we don't yet support
891         return cast(long)value;
892     }
893 }
894 
895 //
896 //  </ROUNDING>
897 //
898 
899 
900 // using the Intel terminology here
901 
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    // Clamp a 16-bit signed value into the signed 8-bit range [-128, 127].
    if (value < byte.min) return byte.min;
    if (value > byte.max) return byte.max;
    return cast(byte) value;
}
908 
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    // Clamp a 16-bit signed value into the unsigned 8-bit range [0, 255].
    if (value < 0) return 0;
    if (value > ubyte.max) return ubyte.max;
    return cast(ubyte) value;
}
915 
short saturateSignedIntToSignedShort(int value) pure @safe
{
    // Clamp a 32-bit signed value into the signed 16-bit range [-32768, 32767].
    if (value < short.min) return short.min;
    if (value > short.max) return short.max;
    return cast(short) value;
}
922 
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    // Clamp a 32-bit signed value into the unsigned 16-bit range [0, 65535].
    if (value < 0) return 0;
    if (value > ushort.max) return ushort.max;
    return cast(ushort) value;
}
929 
unittest // test saturate operations
{
    // Values far outside the target range must clamp to the range boundary.
    assert( saturateSignedWordToSignedByte(32000) == 127);
    assert( saturateSignedWordToUnsignedByte(32000) == 255);
    assert( saturateSignedWordToSignedByte(-4000) == -128);
    assert( saturateSignedWordToUnsignedByte(-4000) == 0);
    assert( saturateSignedIntToSignedShort(32768) == 32767);
    // 32768 fits the unsigned 16-bit range, so it passes through unchanged.
    assert( saturateSignedIntToUnsignedShort(32768) == 32768);
    assert( saturateSignedIntToSignedShort(-32769) == -32768);
    assert( saturateSignedIntToUnsignedShort(-32769) == 0);
}
941 
942 version(unittest)
943 {
944     // This is just for debugging tests
945     import core.stdc.stdio: printf;
946 
947     // printing vectors for implementation
948     // Note: you can override `pure` within a `debug` clause
949 
    // Debug helper: prints the single 64-bit lane of an __m64.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 vl = cast(long1)v;
        printf("%lld\n", vl.array[0]);
    }
955 
    // Debug helper: prints an __m64 as two 32-bit signed ints.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] C = (cast(int2)v).array;
        printf("%d %d\n", C[0], C[1]);
    }
961 
    // Debug helper: prints an __m64 as four 16-bit signed shorts.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] C = (cast(short4)v).array;
        printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    }
967 
    // Debug helper: prints an __m64 as eight signed bytes.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] C = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }
974 
    // Debug helper: prints an __m128i as two 64-bit signed longs.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 vl = cast(long2)v;
        printf("%lld %lld\n", vl.array[0], vl.array[1]);
    }
980 
    // Debug helper: prints an __m128i as four 32-bit signed ints.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n",
              v.array[0], v.array[1], v.array[2], v.array[3]);
    }
986 
    // Debug helper: prints a short8 as eight 16-bit signed shorts.
    void _mm_print_short8(short8 v) @trusted
    {
        printf("%d %d %d %d %d %d %d %d\n",
               v.array[0], v.array[1], v.array[2], v.array[3],
               v.array[4], v.array[5], v.array[6], v.array[7]);
    }
993 
    // Debug helper: prints an __m128i as eight 16-bit signed shorts.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] C = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7]);
    }
1000 
    // Debug helper: prints an __m128i as sixteen signed bytes.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] C = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
        C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15]);
    }
1007 
    // Debug helper: prints an __m128 as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        float[4] C = (cast(float4)v).array;
        printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
    }
1014 
1015     void _mm_print_pd(__m128d v) @trusted
1016     {
1017         double[2] C = (cast(double2)v).array;
1018         printf("%f %f\n", C[0], C[1]);
1019     }
1020 
    // Debug helper: prints an __m256d as four doubles.
    void _mm256_print_pd(__m256d v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        printf("%g %g %g %g\n", v.array[0], v.array[1], v.array[2], v.array[3]); 
    }
1026 
    // Debug helper: prints an __m256 as eight floats.
    void _mm256_print_ps(__m256 v) @trusted
    {
        // %g because %f can conceal very small numbers and prints zero instead
        printf("%g %g %g %g %g %g %g %g\n", 
            v.array[0], v.array[1], v.array[2], v.array[3],
            v.array[4], v.array[5], v.array[6], v.array[7]); 
    }
1034 
    // Debug helper: prints an __m256i as sixteen 16-bit signed shorts.
    void _mm256_print_epi16(__m256i v) @trusted
    {
        short16 vl = cast(short16)v;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", 
               vl.array[0], vl.array[1], vl.array[2], vl.array[3],
               vl.array[4], vl.array[5], vl.array[6], vl.array[7],
               vl.array[8], vl.array[9], vl.array[10], vl.array[11],
               vl.array[12], vl.array[13], vl.array[14], vl.array[15]);
    }
1044 
    // Debug helper: prints an __m256i as eight 32-bit signed ints.
    void _mm256_print_epi32(__m256i v) @trusted
    {
        int8 vl = cast(int8)v;
        printf("%d %d %d %d %d %d %d %d\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3],
                                            vl.array[4], vl.array[5], vl.array[6], vl.array[7]);
    }
1051 
    // Debug helper: prints an __m256i as four 64-bit signed longs.
    void _mm256_print_epi64(__m256i v) @trusted
    {
        long4 vl = cast(long4)v;
        printf("%lld %lld %lld %lld\n", vl.array[0], vl.array[1], vl.array[2], vl.array[3]);
    }
1057 
    // Debug helper: prints an __m256i as thirty-two signed bytes.
    void _mm256_print_epi8(__m256i v) @trusted
    {
        byte[32] C = (cast(byte32)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               C[0], C[1], C[2], C[3], C[4], C[5], C[6], C[7], 
               C[8], C[9], C[10], C[11], C[12], C[13], C[14], C[15],
               C[16], C[17], C[18], C[19], C[20], C[21], C[22], C[23], 
               C[24], C[25], C[26], C[27], C[28], C[29], C[30], C[31]);

    }
1068 }
1069 
1070 
1071 //
1072 //  <FLOATING-POINT COMPARISONS>
1073 //
1074 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
1075 //       need different IR generation.
1076 
enum FPComparison
{
    // Member names and semantics follow LLVM's fcmp condition codes.
    // NOTE: declaration order must match FPComparisonToString below,
    // which is indexed by these values.
    false_,// always false
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
    true_, // always true
}
1096 
// Maps each FPComparison value to the LLVM fcmp condition-code string used
// when building inline IR (see cmpps/cmppd below). Must stay in the exact
// declaration order of FPComparison.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "false",
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
    "true"
];
1116 
1117 // AVX FP comparison to FPComparison
FPComparison mapAVXFPComparison(int imm8) pure @safe
{
    // Translate an AVX _CMP_* immediate into our FPComparison.
    // Only the low 4 bits select the predicate; the signalling/quiet
    // distinction of the full 5-bit AVX encoding is deliberately dropped,
    // every predicate maps onto its non-signalling counterpart.
    switch (imm8 & 0x0f)
    {
        case  0: return FPComparison.oeq;    // _CMP_EQ_OQ
        case  1: return FPComparison.olt;    // _CMP_LT_OS
        case  2: return FPComparison.ole;    // _CMP_LE_OS
        case  3: return FPComparison.uno;    // _CMP_UNORD_Q
        case  4: return FPComparison.une;    // _CMP_NEQ_UQ (true if not-equal OR unordered)
        case  5: return FPComparison.uge;    // _CMP_NLT_US
        case  6: return FPComparison.ugt;    // _CMP_NLE_US
        case  7: return FPComparison.ord;    // _CMP_ORD_Q
        case  8: return FPComparison.ueq;    // _CMP_EQ_UQ
        case  9: return FPComparison.ult;    // _CMP_NGE_US
        case 10: return FPComparison.ule;    // _CMP_NGT_US
        case 11: return FPComparison.false_; // _CMP_FALSE_OQ
        case 12: return FPComparison.one;    // _CMP_NEQ_OQ
        case 13: return FPComparison.oge;    // _CMP_GE_OS
        case 14: return FPComparison.ogt;    // _CMP_GT_OS
        case 15: return FPComparison.true_;  // _CMP_TRUE_UQ
        default: assert(0); // unreachable: value was masked into [0, 15]
    }
}
1143 
// Individual float comparison: returns true if the predicate holds, else false.
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    // "Unordered" means at least one operand is NaN (fcmp terminology).
    bool unordered = isnan(a) || isnan(b);
    final switch(comparison) with(FPComparison)
    {
        case false_: return false;
        // Ordered predicates: D's <, <=, ==, etc. are already false when
        // either operand is NaN, so no extra unordered check is needed.
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // NaN with != always yields true
        case ord: return !unordered; 
        // Unordered predicates succeed whenever a NaN is involved.
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
        case true_: return true;
    }
}
1169 
static if (LDC_with_optimizations) // this saves time for bigger projects, since LDCInlineIR gets more expensive there.
{
    /// Provides packed float comparisons
    /// Returns an all-ones (-1) lane where the predicate holds, 0 elsewhere.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        // fcmp yields <4 x i1>; sext turns each i1 into the usual -1/0 SSE mask.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }

    ///ditto
    package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <8 x float> %0, %1
            %r = sext <8 x i1> %cmp to <8 x i32>
            ret <8 x i32> %r`;
        return LDCInlineIR!(ir, int8, float8, float8)(a, b);
    }

    /// Provides packed double comparisons
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }

    ///ditto 
    package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x double> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i64>
            ret <4 x i64> %r`;
        return LDCInlineIR!(ir, long4, double4, double4)(a, b);
    }

    /// CMPSS-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7. 
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
        */
        // Compare only lane 0; its -1/0 integer mask is bitcast back to float.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        // Upper lanes pass through unchanged from `a`, as CMPSS does.
        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }

    /// CMPSD-style comparisons
    /// clang implement it through x86 intrinsics, it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. 
    /// Not that simple.    
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
}
else
{
    /// Scalar fallback for packed float comparison: writes -1 in every lane
    /// where the predicate holds, 0 elsewhere.
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        int4 r;
        for (int lane = 0; lane < 4; ++lane)
        {
            bool hit = compareFloat!float(comparison, a.array[lane], b.array[lane]);
            r.ptr[lane] = hit ? -1 : 0;
        }
        return r;
    }
    ///ditto
    package int8 cmpps256(FPComparison comparison)(float8 a, float8 b) pure @trusted
    {
        int8 r;
        for (int lane = 0; lane < 8; ++lane)
        {
            bool hit = compareFloat!float(comparison, a.array[lane], b.array[lane]);
            r.ptr[lane] = hit ? -1 : 0;
        }
        return r;
    }

    /// Scalar fallback for packed double comparison.
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        long2 r;
        for (int lane = 0; lane < 2; ++lane)
        {
            bool hit = compareFloat!double(comparison, a.array[lane], b.array[lane]);
            r.ptr[lane] = hit ? -1 : 0;
        }
        return r;
    }
    ///ditto
    package long4 cmppd256(FPComparison comparison)(double4 a, double4 b) pure @trusted
    {
        long4 r;
        for (int lane = 0; lane < 4; ++lane)
        {
            bool hit = compareFloat!double(comparison, a.array[lane], b.array[lane]);
            r.ptr[lane] = hit ? -1 : 0;
        }
        return r;
    }

    /// Scalar fallback for CMPSS-style comparison: lane 0 gets the -1/0 mask,
    /// upper lanes pass through unchanged from `a`.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
    {
        bool hit = compareFloat!float(comparison, a.array[0], b.array[0]);
        int4 bits = cast(int4)a;
        bits.ptr[0] = hit ? -1 : 0;
        return cast(float4)bits;
    }

    /// Scalar fallback for CMPSD-style comparison: lane 0 gets the -1/0 mask,
    /// the upper lane passes through unchanged from `a`.
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
    {
        bool hit = compareFloat!double(comparison, a.array[0], b.array[0]);
        long2 bits = cast(long2)a;
        bits.ptr[0] = hit ? -1 : 0;
        return cast(double2)bits;
    }
}
unittest // cmpps
{
    // Check all comparison type is working
    // Lane 3 pairs NaN with 5: ordered predicates must give 0 there,
    // unordered predicates must give -1.
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    int4 result_oeq = cmpps!(FPComparison.oeq)(A, B);
    int4 result_ogt = cmpps!(FPComparison.ogt)(A, B);
    int4 result_oge = cmpps!(FPComparison.oge)(A, B);
    int4 result_olt = cmpps!(FPComparison.olt)(A, B);
    int4 result_ole = cmpps!(FPComparison.ole)(A, B);
    int4 result_one = cmpps!(FPComparison.one)(A, B);
    int4 result_ord = cmpps!(FPComparison.ord)(A, B);
    int4 result_ueq = cmpps!(FPComparison.ueq)(A, B);
    int4 result_ugt = cmpps!(FPComparison.ugt)(A, B);
    int4 result_uge = cmpps!(FPComparison.uge)(A, B);
    int4 result_ult = cmpps!(FPComparison.ult)(A, B);
    int4 result_ule = cmpps!(FPComparison.ule)(A, B);
    int4 result_une = cmpps!(FPComparison.une)(A, B);
    int4 result_uno = cmpps!(FPComparison.uno)(A, B);

    static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt    = [-1, 0, 0, 0];
    static immutable int[4] correct_ole    = [-1,-1, 0, 0];
    static immutable int[4] correct_one    = [-1, 0,-1, 0];
    static immutable int[4] correct_ord    = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult    = [-1, 0, 0,-1];
    static immutable int[4] correct_ule    = [-1,-1, 0,-1];
    static immutable int[4] correct_une    = [-1, 0,-1,-1];
    static immutable int[4] correct_uno    = [ 0, 0, 0,-1];

    assert(result_oeq.array == correct_oeq);
    assert(result_ogt.array == correct_ogt);
    assert(result_oge.array == correct_oge);
    assert(result_olt.array == correct_olt);
    assert(result_ole.array == correct_ole);
    assert(result_one.array == correct_one);
    assert(result_ord.array == correct_ord);
    assert(result_ueq.array == correct_ueq);
    assert(result_ugt.array == correct_ugt);
    assert(result_uge.array == correct_uge);
    assert(result_ult.array == correct_ult);
    assert(result_ule.array == correct_ule);
    assert(result_une.array == correct_une);
    assert(result_uno.array == correct_uno);
}
unittest
{
    // cmppd smoke test with ult: 1 < 2 yields -1, 3 < 3 yields 0.
    double2 a = [1, 3];
    double2 b = [2, 3];
    long2 c = cmppd!(FPComparison.ult)(a, b);
    static immutable long[2] correct = [cast(long)(-1), 0];
    assert(c.array == correct);
}
unittest // cmpss
{
    // Cross-checks cmpss against the scalar compareFloat reference, and
    // verifies the upper 3 lanes are passed through unchanged from A.
    void testComparison(FPComparison comparison)(float4 A, float4 B)
    {
        float4 result = cmpss!comparison(A, B);
        int4 iresult = cast(int4)result;
        int expected = compareFloat!float(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
        assert(result.array[2] == A.array[2]);
        assert(result.array[3] == A.array[3]);
    }

    // Check all comparison type is working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5]; // exercises the unordered path in lane 0

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
unittest // cmpsd
{
    // Cross-checks cmpsd against the scalar compareFloat reference, and
    // verifies the upper lane is passed through unchanged from A.
    void testComparison(FPComparison comparison)(double2 A, double2 B)
    {
        double2 result = cmpsd!comparison(A, B);
        long2 iresult = cast(long2)result;
        long expected = compareFloat!double(comparison, A.array[0], B.array[0]) ? -1 : 0;
        assert(iresult.array[0] == expected);
        assert(result.array[1] == A.array[1]);
    }

    // Check all comparison type is working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5]; // exercises the unordered path in lane 0

    testComparison!(FPComparison.oeq)(A, B);
    testComparison!(FPComparison.oeq)(A, C);
    testComparison!(FPComparison.ogt)(A, B);
    testComparison!(FPComparison.ogt)(A, C);
    testComparison!(FPComparison.oge)(A, B);
    testComparison!(FPComparison.oge)(A, C);
    testComparison!(FPComparison.olt)(A, B);
    testComparison!(FPComparison.olt)(A, C);
    testComparison!(FPComparison.ole)(A, B);
    testComparison!(FPComparison.ole)(A, C);
    testComparison!(FPComparison.one)(A, B);
    testComparison!(FPComparison.one)(A, C);
    testComparison!(FPComparison.ord)(A, B);
    testComparison!(FPComparison.ord)(A, C);
    testComparison!(FPComparison.ueq)(A, B);
    testComparison!(FPComparison.ueq)(A, C);
    testComparison!(FPComparison.ugt)(A, B);
    testComparison!(FPComparison.ugt)(A, C);
    testComparison!(FPComparison.uge)(A, B);
    testComparison!(FPComparison.uge)(A, C);
    testComparison!(FPComparison.ult)(A, B);
    testComparison!(FPComparison.ult)(A, C);
    testComparison!(FPComparison.ule)(A, B);
    testComparison!(FPComparison.ule)(A, C);
    testComparison!(FPComparison.une)(A, B);
    testComparison!(FPComparison.une)(A, C);
    testComparison!(FPComparison.uno)(A, B);
    testComparison!(FPComparison.uno)(A, C);
}
1469 
1470 //
1471 //  </FLOATING-POINT COMPARISONS>
1472 //
1473 
1474 
__m64 to_m64(__m128i a) pure @trusted
{
    // Keep only the low 64-bit lane of the 128-bit vector.
    long1 low = (cast(long2)a).array[0];
    return low;
}
1481 
// Zero-extends a 64-bit __m64 into the low lane of an __m128i (upper lane = 0).
__m128i to_m128i(__m64 a) pure @trusted
{
  /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474 
    
    version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474 
    {
        long2 r = a.array[0];
        r.ptr[1] = 0;
        return cast(int4)r;
    }
    else */
    {
        // Initialize both lanes to zero first, then set the low lane.
        long2 r = [0, 0];
        r.ptr[0] = a.array[0];
        return cast(__m128i)r;
    }
}
1499 
1500 
1501 // ADDITIONAL LLVM INTRINSICS
1502 // Basically LDC didn't add them yet
version(LDC)
{
    static if (__VERSION__ >= 2097) // LDC 1.27+
    {
        // Integer absolute value; the i# suffix is expanded per element type.
        pragma(LDC_intrinsic, "llvm.abs.i#")
            T inteli_llvm_abs(T)(T val, bool attrib);
    }

    static if (__VERSION__ >= 2092) // LDC 1.22+
    {
        // Saturating integer arithmetic (signed add/sub, unsigned add/sub).
        pragma(LDC_intrinsic, "llvm.sadd.sat.i#")
            T inteli_llvm_adds(T)(T a, T b) pure @safe;
        pragma(LDC_intrinsic, "llvm.ssub.sat.i#")
            T inteli_llvm_subs(T)(T a, T b) pure @safe;
        pragma(LDC_intrinsic, "llvm.uadd.sat.i#")
            T inteli_llvm_addus(T)(T a, T b) pure @safe;
        pragma(LDC_intrinsic, "llvm.usub.sat.i#")
            T inteli_llvm_subus(T)(T a, T b) pure @safe;

        enum LDC_with_saturated_intrinsics = true;
    }
    else
        enum LDC_with_saturated_intrinsics = false;
}
else
    enum LDC_with_saturated_intrinsics = false;
1529 
1530 // ADDITIONAL x86 INTRINSICS
1531 // Absent from ldc.gccbuiltins_x86 for some reason, but needed.
1532 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td
static if (LDC_with_SSE41)
{
    // SSE4.1 PBLENDVB: byte-wise variable blend.
    pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
        byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe;
}
1538 
1539 // SOME NEON INTRINSICS
1540 // Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
1541 // Not in the public API but the simde project expose it all for the user to use.
1542 // MAYDO: create a new neon.d module, for internal use only.
1543 // MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
1544 static if (LDC_with_ARM64)
1545 {
1546     // VERY USEFUL LINK
1547     // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
1548     // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/
1549 
1550     // Note: it is helpful to verify, in case of complex sequence of intrinsics, that the result is actually false.
1551     // Some intrinsics have trouble when inlined inside another, such as vmovl_low_s32. In this case, it's better to use builtins 
1552     // from backend to have an inlining that still match the instruction.
1553 
    // CRC32C (Castagnoli) scalar instructions; the suffix b/h/w/x selects
    // the input width (byte / halfword / word / doubleword).
    pragma(LDC_intrinsic, "llvm.aarch64.crc32cb")
        uint __crc32cb(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32ch")
        uint __crc32ch(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32cw")
        uint __crc32cw(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32cx")
        uint __crc32cd(uint a, ulong b) pure @safe;

    //pragma(LDC_intrinsic, "llvm.aarch64.dmb")
    //    uint __dmb(int a) @safe; // didn't find a name in intrinsic list

    // Unsigned absolute difference, per byte lane.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
        byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;

    // Per-lane signed absolute value.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
        short8 vabsq_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
        int4 vabsq_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
        byte16 vabsq_s8(byte16 a) pure @safe;
1580 
    // Lane-wise bitwise AND (NEON vand_u8).
    byte8 vand_u8(byte8 a, byte8 b) pure @safe
    {
        return a & b;
    }
1585 
1586     long2 vandq_s64(long2 a, long2 b)
1587     {
1588         return a & b;
1589     }
1590 
    // Bit clear: a AND NOT b, per lane (NEON vbicq_s64).
    long2 vbicq_s64(long2 a, long2 b) pure @safe
    {
        return a & ~b;
    }
1595 
    // Bitwise select (NEON vbslq_s32): for each bit, a ? b : c.
    int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe
    {
        return c ^ ((c ^ b) & a); // where a bit is set: c^(c^b) = b; else c
    }
1600 
    // Bitwise select (NEON vbslq_s8): for each bit, a ? b : c.
    byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }
1605 
    // Bitwise select (NEON vbslq_s64): for each bit, a ? b : c.
    long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe
    {
        return c ^ ((c ^ b) & a);
    }
1610 
1611     short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
1612     {
1613         short8 r;
1614         r.ptr[0]  = lo.array[0];
1615         r.ptr[1]  = lo.array[1];
1616         r.ptr[2]  = lo.array[2];
1617         r.ptr[3]  = lo.array[3];
1618         r.ptr[4]  = hi.array[0];
1619         r.ptr[5]  = hi.array[1];
1620         r.ptr[6]  = hi.array[2];
1621         r.ptr[7]  = hi.array[3];
1622         return r;
1623     }
1624 
1625     int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
1626     {
1627         int4 r;
1628         r.ptr[0] = lo.array[0];
1629         r.ptr[1] = lo.array[1];
1630         r.ptr[2] = hi.array[0];
1631         r.ptr[3] = hi.array[1];
1632         return r;
1633     }
1634 
1635     byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
1636     {
1637         byte16 r;
1638         r.ptr[0]  = lo.array[0];
1639         r.ptr[1]  = lo.array[1];
1640         r.ptr[2]  = lo.array[2];
1641         r.ptr[3]  = lo.array[3];
1642         r.ptr[4]  = lo.array[4];
1643         r.ptr[5]  = lo.array[5];
1644         r.ptr[6]  = lo.array[6];
1645         r.ptr[7]  = lo.array[7];
1646         r.ptr[8]  = hi.array[0];
1647         r.ptr[9]  = hi.array[1];
1648         r.ptr[10] = hi.array[2];
1649         r.ptr[11] = hi.array[3];
1650         r.ptr[12] = hi.array[4];
1651         r.ptr[13] = hi.array[5];
1652         r.ptr[14] = hi.array[6];
1653         r.ptr[15] = hi.array[7];
1654         return r;
1655     }
1656 
1657     short8 vcombine_u16(short4 lo, short4 hi) pure @trusted
1658     {
1659         short8 r;
1660         r.ptr[0]  = lo.array[0];
1661         r.ptr[1]  = lo.array[1];
1662         r.ptr[2]  = lo.array[2];
1663         r.ptr[3]  = lo.array[3];
1664         r.ptr[4]  = hi.array[0];
1665         r.ptr[5]  = hi.array[1];
1666         r.ptr[6]  = hi.array[2];
1667         r.ptr[7]  = hi.array[3];
1668         return r;
1669     }
1670 
1671 
    // float4 => int4
    // Rounding-mode suffix in the intrinsic name: m = toward -inf,
    // n = to nearest (ties to even), p = toward +inf, z = toward zero,
    // mirroring AArch64 FCVT{M,N,P,Z}S.

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
        int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
        int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
        int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
        int4 vcvtzq_s32_f32(float4 a) pure @safe;


    // float4 <=> half4

    pragma(LDC_intrinsic, "llvm.aarch64.neon.vcvtfp2hf")
        short4 vcvt_f16_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.vcvthf2fp")
        float4 vcvt_f32_f16(short4 a) pure @safe;

    // double2 => long2

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
        long2 vcvtmq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
        long2 vcvtnq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
        long2 vcvtpq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
        long2 vcvtzq_s64_f64(double2 a) pure @safe;

    // Scalar conversions: every combination of {float, double} source and
    // {int, long} destination, in the four rounding modes above.

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
        int vcvtms_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
        int vcvtns_s32_f32(float a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
        int vcvtps_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
        int vcvts_s32_f32(float a) pure @safe;
     
    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
        int vcvtms_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
        int vcvtns_s32_f64(double a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
        int vcvtps_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
        int vcvts_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
        long vcvtms_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
        long vcvtns_s64_f32(float a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
        long vcvtps_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
        long vcvts_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
        long vcvtms_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
        long vcvtns_s64_f64(double a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
        long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
        long vcvts_s64_f64(double a) pure @safe;
1756 
1757     long2 vdupq_n_s64(long value) pure @safe
1758     {
1759         long2 r;
1760         r = value;
1761         return r;
1762     }
1763 
1764     short4 vget_high_s16(short8 a) pure @trusted
1765     {
1766         short4 r;
1767         r.ptr[0] = a.array[4];
1768         r.ptr[1] = a.array[5];
1769         r.ptr[2] = a.array[6];
1770         r.ptr[3] = a.array[7];
1771         return r;
1772     }
1773 
1774     int2 vget_high_s32(int4 a) pure @trusted
1775     {
1776         int2 r;
1777         r.ptr[0] = a.array[2];
1778         r.ptr[1] = a.array[3];
1779         return r;
1780     }
1781 
1782     byte8 vget_high_u8(byte16 a) pure @trusted
1783     {
1784         byte8 r;
1785         r.ptr[0] = a.array[8];
1786         r.ptr[1] = a.array[9];
1787         r.ptr[2] = a.array[10];
1788         r.ptr[3] = a.array[11];
1789         r.ptr[4] = a.array[12];
1790         r.ptr[5] = a.array[13];
1791         r.ptr[6] = a.array[14];
1792         r.ptr[7] = a.array[15];
1793         return r;
1794     }
1795 
1796     short4 vget_low_s16(short8 a) pure @trusted
1797     {
1798         short4 r;
1799         r.ptr[0] = a.array[0];
1800         r.ptr[1] = a.array[1];
1801         r.ptr[2] = a.array[2];
1802         r.ptr[3] = a.array[3];
1803         return r;
1804     } 
1805 
1806     int2 vget_low_s32(int4 a) pure @trusted
1807     {
1808         int2 r;
1809         r.ptr[0] = a.array[0];
1810         r.ptr[1] = a.array[1];
1811         return r;
1812     }
1813 
1814     byte8 vget_low_u8(byte16 a) pure @trusted
1815     {
1816         byte8 r;
1817         r.ptr[0] = a.array[0];
1818         r.ptr[1] = a.array[1];
1819         r.ptr[2] = a.array[2];
1820         r.ptr[3] = a.array[3];
1821         r.ptr[4] = a.array[4];
1822         r.ptr[5] = a.array[5];
1823         r.ptr[6] = a.array[6];
1824         r.ptr[7] = a.array[7];
1825         return r;
1826     }
1827 
1828     long vgetq_lane_s64(long2 v, const int lane) pure @safe
1829     {
1830         return v.array[lane];
1831     }
1832 
    // Lane-wise signed maximum (AArch64 SMAX) over eight 16-bit lanes.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
        short8 vmaxq_s16(short8 a, short8 b) pure @safe;
1835 
1836     int4 vmaxq_s32(int4 a, int4 b) pure @safe
1837     {
1838         int4 r;
1839         r[0] = a[0] >= b[0] ? a[0] : b[0];
1840         r[1] = a[1] >= b[1] ? a[1] : b[1];
1841         r[2] = a[2] >= b[2] ? a[2] : b[2];
1842         r[3] = a[3] >= b[3] ? a[3] : b[3];
1843         return r;
1844     }
1845 
    // Lane-wise signed minimum (AArch64 SMIN) over eight 16-bit lanes.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
        short8 vminq_s16(short8 a, short8 b) pure @safe;
1848 
1849     int4 vmovl_u16(short4 a) pure @trusted
1850     {
1851         int4 r;
1852         r.ptr[0] = cast(ushort)a.array[0];
1853         r.ptr[1] = cast(ushort)a.array[1];
1854         r.ptr[2] = cast(ushort)a.array[2];
1855         r.ptr[3] = cast(ushort)a.array[3];
1856         return r;
1857     }
1858 
1859     int2 vmovn_s64(long2 a) pure @trusted
1860     {
1861         int2 r;
1862         r.ptr[0] = cast(int)(a.array[0]);
1863         r.ptr[1] = cast(int)(a.array[1]);
1864         return r;
1865     }        
1866 
1867     int4 vmull_s16(short4 a, short4 b) pure @trusted
1868     {
1869         int4 r;
1870         r.ptr[0] = a.array[0] * b.array[0];
1871         r.ptr[1] = a.array[1] * b.array[1];
1872         r.ptr[2] = a.array[2] * b.array[2];
1873         r.ptr[3] = a.array[3] * b.array[3];
1874         return r;
1875     }
1876 
    // Signed widening multiply: 32-bit lane pairs => 64-bit products.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64")
        long2 vmull_s32(int2 a, int2 b) pure @safe;

    // ADDP: pairwise addition of adjacent lanes of the concatenated operands.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16")
        short4 vpadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
        int2 vpadd_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
        byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    // UADDLP: unsigned pairwise add-long (adjacent 8-bit lanes => 16-bit sums).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
        short8 vpaddlq_u8 (byte16 a) pure @safe;

    static if(__VERSION__ >= 2088) // LDC 1.18 started using LLVM 9, which renamed this builtin
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16")
        short8 vpaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
        byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32")
        int4 vpaddq_s32(int4 a, int4 b) pure @safe;

    // SQADD / SQSUB: saturating signed addition / subtraction.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16")
        short4 vqadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16")
        short8 vqaddq_s16(short8 a, short8 b) pure @safe;

    // SQXTN: saturating narrow, signed => signed.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
        byte8 vqmovn_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
        short4 vqmovn_s32(int4 a) pure @safe;

    // UQXTN: saturating narrow, unsigned => unsigned.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16")
        short4 vqmovn_u32(int4 a) pure @safe;

    // SQXTUN: saturating narrow, signed input => unsigned output.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
        byte8 vqmovun_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16")
        short4 vqsub_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16")
        short8 vqsubq_s16(short8 a, short8 b) pure @safe;

    // TBL: byte table lookup — selects bytes of `t` by the indices in `idx`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8")
        byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe;

    // URHADD: unsigned rounding halving add.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
        byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
        short8 vrhadd_u16(short8 a, short8 b) pure @safe;

    // RSHRN: rounding shift right by n, then narrow lanes to half width.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16")
        short4 vrshrn_n_s32(int4 a, int n) pure @safe;
1947 
1948     byte8 vshr_u8(byte8 a, byte8 b) pure @safe
1949     {
1950         return a >>> b;
1951     }
1952 
1953     byte16 vshrq_n_s8(byte16 a, byte r) pure @safe
1954     { 
1955         a = a >> byte16(cast(byte)r);
1956         return a;
1957     }
1958 
    // TBL with an 8-byte result: byte table lookup into the 16-byte table `t`.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8")
        byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe;
1961 }
1962 
version(unittest)
{
    /// Absolute value of a double, usable from pure unittests.
    /// Clears the sign bit directly instead of calling core.stdc.math.
    double abs_double(double x) @trusted
    {
        version(LDC)
            return llvm_fabs(x);
        else
        {
            // Reinterpret the bits, mask off the sign bit, reinterpret back.
            ulong bits = *cast(ulong*)(&x);
            bits &= 0x7fffffff_ffffffff;
            return *cast(double*)(&bits);
        }
    }
}
1977 
// Needed because core.stdc.math.isnan isn't `pure` in old GDC releases (as used on Travis CI).
1979 
/// Returns true if `x` is a NaN: exponent bits all ones and a non-zero mantissa.
bool isnan(float x) pure @trusted
{
    enum uint expMask  = 0x7F800000;
    enum uint mantMask = 0x007FFFFF;
    uint bits = *cast(uint*)(&x);
    return ((bits & expMask) == expMask) && ((bits & mantMask) != 0);
}
unittest
{
    // NaN is detected; zero and infinity are not NaN.
    assert( isnan(float.nan));
    assert(!isnan(0.0f));
    assert(!isnan(float.infinity));
}
1997 
/// Returns true if `x` is a NaN: exponent bits all ones and a non-zero mantissa.
bool isnan(double x) pure @trusted
{
    enum ulong expMask  = 0x7FF00000_00000000;
    enum ulong mantMask = 0x000FFFFF_FFFFFFFF;
    ulong bits = *cast(ulong*)(&x);
    return ((bits & expMask) == expMask) && ((bits & mantMask) != 0);
}
unittest
{
    // NaN is detected; zero and infinity are not NaN.
    assert( isnan(double.nan));
    assert(!isnan(0.0));
    assert(!isnan(double.infinity));
}