1 /**
2 * Internal stuff only, do not import.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.internals;
8 
9 import inteli.types;
10 
11 // The only math functions needed for intel-intrinsics
12 public import core.math: sqrt; // since it's an intrinsics
13 
14 package:
15 nothrow:
16 @nogc:
17 
18 
// Compile-time feature flags for the GDC back-end.
// Every GDC_with_* flag is always declared (and is false when the compiler
// is not GDC), so the rest of the package can test them with `static if`.
version(GNU)
{
    version (X86_64)
    {
        // GDC support uses extended inline assembly:
        //   https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html        (general information and hints)
        //   https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html  (binding variables to registers)
        //   https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html (x86 specific register short names)

        public import core.simd: byte16, short8, int4, float4, double2;

        // NOTE: These intrinsics are not available in every i386 and x86_64 CPU.
        // For more info: https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
        public import gcc.builtins;

        enum GDC_with_x86 = true;

        // There is no way to detect these at compile time, but they are assumed to be there.
        enum GDC_with_MMX = true;
        enum GDC_with_SSE = true;
        enum GDC_with_SSE2 = true;

        // TODO: there is no way to detect these at compile time, so they stay disabled.
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_AVX2 = false;

        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
    else
    {
        version (X86)
        {
            // For 32-bit x86, vector extensions are disabled with GDC.
            // It just doesn't work well.
            enum GDC_with_x86 = true;
        }
        else
        {
            // GDC targeting something that is not x86 at all.
            enum GDC_with_x86 = false;
        }

        enum GDC_with_MMX = false;
        enum GDC_with_SSE = false;
        enum GDC_with_SSE2 = false;
        enum GDC_with_SSE3 = false;
        enum GDC_with_SSSE3 = false;
        enum GDC_with_SSE41 = false;
        enum GDC_with_SSE42 = false;
        enum GDC_with_AVX = false;
        enum GDC_with_AVX2 = false;
        enum GDC_with_SHA = false;
        enum GDC_with_BMI2 = false;
    }
}
else
{
    // Not GDC: every flag is off.
    enum GDC_with_x86 = false;
    enum GDC_with_MMX = false;
    enum GDC_with_SSE = false;
    enum GDC_with_SSE2 = false;
    enum GDC_with_SSE3 = false;
    enum GDC_with_SSSE3 = false;
    enum GDC_with_SSE41 = false;
    enum GDC_with_SSE42 = false;
    enum GDC_with_AVX = false;
    enum GDC_with_AVX2 = false;
    enum GDC_with_SHA = false;
    enum GDC_with_BMI2 = false;
}
96 
// Compile-time feature flags for LDC, plus the inline-IR aliases used by
// this package. Every LDC_with_* flag is always declared (and is false when
// the compiler is not LDC).
version(LDC)
{
    public import core.simd;
    public import ldc.simd;
    public import ldc.intrinsics;
    public import ldc.llvmasm: __asm;

    static if (__VERSION__ < 2083)
    {
        // Before LDC 1.13 only inlineIR is available
        // (and there is no inline IR variant with prefix/suffix).
        alias LDCInlineIR = inlineIR;
    }
    else
    {
        // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
        import ldc.llvmasm;
        alias LDCInlineIR = __ir_pure;

        // A version of inline IR with prefix/suffix didn't exist before LDC 1.13
        alias LDCInlineIREx = __irEx_pure;
    }

    version(AArch64)
    {
        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = true; // implies "has Neon"
        enum LDC_with_ARM64_CRC = __traits(targetHasFeature, "crc");

        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else version(ARM)
    {
        public import ldc.gccbuiltins_arm;

        enum LDC_with_ARM32 = true;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;

        enum LDC_with_SSE1 = false;
        enum LDC_with_SSE2 = false;
        enum LDC_with_SSE3 = false;
        enum LDC_with_SSSE3 = false;
        enum LDC_with_SSE41 = false;
        enum LDC_with_SSE42 = false;
        enum LDC_with_AVX = false;
        enum LDC_with_AVX2 = false;
        enum LDC_with_SHA = false;
        enum LDC_with_BMI2 = false;
    }
    else
    {
        // Any other target is handled as x86: each flag mirrors the
        // corresponding feature of the compilation target.
        public import ldc.gccbuiltins_x86;

        enum LDC_with_ARM32 = false;
        enum LDC_with_ARM64 = false;
        enum LDC_with_ARM64_CRC = false;

        enum LDC_with_SSE1 = __traits(targetHasFeature, "sse");
        enum LDC_with_SSE2 = __traits(targetHasFeature, "sse2");
        enum LDC_with_SSE3 = __traits(targetHasFeature, "sse3");
        enum LDC_with_SSSE3 = __traits(targetHasFeature, "ssse3");
        enum LDC_with_SSE41 = __traits(targetHasFeature, "sse4.1");
        enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");
        enum LDC_with_AVX = __traits(targetHasFeature, "avx");
        enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2");
        enum LDC_with_SHA = __traits(targetHasFeature, "sha");
        enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2");
    }
}
else
{
    // Not LDC: every flag is off.
    enum LDC_with_ARM32 = false;
    enum LDC_with_ARM64 = false;
    enum LDC_with_ARM64_CRC = false;
    enum LDC_with_SSE1 = false;
    enum LDC_with_SSE2 = false;
    enum LDC_with_SSE3 = false;
    enum LDC_with_SSSE3 = false;
    enum LDC_with_SSE41 = false;
    enum LDC_with_SSE42 = false;
    enum LDC_with_AVX = false;
    enum LDC_with_AVX2 = false;
    enum LDC_with_SHA = false;
    enum LDC_with_BMI2 = false;
}

// Set when LDC targets either flavour of ARM.
enum LDC_with_ARM = LDC_with_ARM64 | LDC_with_ARM32;
187 
// Compile-time feature flags for DMD. All of them are false with other compilers.
version(DigitalMars)
{
    version(D_InlineAsm_X86)
    {
        enum DMD_with_asm = true;
        enum DMD_with_32bit_asm = true; // sometimes you want a 32-bit DMD only solution
    }
    else version(D_InlineAsm_X86_64)
    {
        enum DMD_with_asm = true;
        enum DMD_with_32bit_asm = false;
    }
    else
    {
        enum DMD_with_asm = false;
        enum DMD_with_32bit_asm = false;
    }

    // D_SIMD only counts when the SSE-sized vector types are not emulated.
    version (D_SIMD)
        enum DMD_with_DSIMD = !SSESizedVectorsAreEmulated;
    else
        enum DMD_with_DSIMD = false;
}
else
{
    enum DMD_with_asm = false;
    enum DMD_with_32bit_asm = false;
    enum DMD_with_DSIMD = false;
}
213 
// 32-bit ARM: accessors for the floating-point control word, implemented
// with the FPSCR builtins.
static if (LDC_with_ARM32)
{
    /// Returns the value given by `__builtin_arm_get_fpscr`.
    package uint arm_get_fpcr() nothrow @nogc @trusted
    {
        const uint controlWord = __builtin_arm_get_fpscr();
        return controlWord;
    }

    /// Passes `cw` to `__builtin_arm_set_fpscr`.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        __builtin_arm_set_fpscr(cw);
    }
}
226 
// 64-bit ARM: accessors for the FPCR register, implemented with inline assembly.
static if (LDC_with_ARM64)
{
    // Declaration of the LLVM intrinsic that is supposed to read FPCR.
    // It is not called by the accessors below (see the note in arm_get_fpcr).
    pragma(LDC_intrinsic, "llvm.aarch64.get.fpcr")
        long __builtin_aarch64_get_fpcr() pure nothrow @nogc @safe;

    /// Reads FPCR with an `mrs` instruction and returns it as a `uint`.
    package uint arm_get_fpcr() pure nothrow @nogc @trusted
    {
        // LLVM intrinsic "llvm.aarch64.get.fpcr" seems buggy and doesn't return FPCR
        return __asm!uint("mrs $0, fpcr", "=r");
    }

    /// Writes `cw` to FPCR with an `msr` instruction.
    package void arm_set_fpcr(uint cw) nothrow @nogc @trusted
    {
        // Note: there doesn't seem to be an intrinsic in LLVM to set FPCR.
        // The sequence uses x2 as scratch: it stores x2 to the second memory
        // operand, loads `cw` into w2, writes x2 to FPCR, then reloads x2.
        // NOTE(review): no clobber list is given; the manual save/restore of
        // x2 appears to be what keeps the register intact - confirm.
        long save_x2;
        __asm!void("str x2, $1 \n" ~
                   "ldr w2, $0 \n" ~
                   "msr fpcr, x2 \n" ~
                   "ldr x2, $1 "   , "m,m", cw, &save_x2);
    }
}
248 
249 
// For internal use only, since public API deals with a x86 semantic emulation.
// The rounding-mode values occupy bits 22-23 of the control word returned by
// arm_get_fpcr(); the flush-to-zero mask is bit 24.
enum uint _MM_ROUND_NEAREST_ARM     = 0u << 22;
enum uint _MM_ROUND_DOWN_ARM        = 2u << 22;
enum uint _MM_ROUND_UP_ARM          = 1u << 22;
enum uint _MM_ROUND_TOWARD_ZERO_ARM = 3u << 22;
enum uint _MM_ROUND_MASK_ARM        = 3u << 22;
enum uint _MM_FLUSH_ZERO_MASK_ARM   = 1u << 24;
257 
258 
259 //
260 //  <ROUNDING>
261 //
//  Why is that there? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to help keep rounding consistent between
//  LDC and DMD. It's important that DMD uses whatever is in MXCSR to round.
//
//  Note: There is no MXCSR on ARM, but there is fpcr/fpscr, which implements similar
//  functionality.
//  https://developer.arm.com/documentation/dui0068/b/vector-floating-point-programming/vfp-system-registers/fpscr--the-floating-point-status-and-control-register
//  We use fpcr/fpscr since it's thread-local, so we can emulate those x86 conversions, albeit slowly.
271 
/// Converts a `float` to a 32-bit signed integer, using the rounding mode
/// currently set in MXCSR (x86) or in the ARM floating-point control register.
///
/// Params:
///     value = the single-precision number to convert.
/// Returns: the converted integer.
int convertFloatToInt32UsingMXCSR(float value) @trusted
{
    int result;
    version(GNU)
    {
        // GDC: extended inline asm, `value` bound to an SSE register ("x")
        // and `result` to a general-purpose register ("=r").
        // NOTE(review): this branch is selected for every GDC target, but
        // the instruction is x86-only - confirm non-x86 GDC is unsupported.
        asm pure nothrow @nogc @trusted
        {
            "cvtss2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // Loads `value` into s2, converts it in place, moves it to the result.
        // TODO: this is a bug, it won't preserve registers when optimized
        result = __asm!int(`vldr s2, $1
                            vcvtr.s32.f32 s2, s2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the conversion helper matching the rounding bits.
        // `default` shares the round-to-nearest case.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f32(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f32(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f32(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f32(value);  break;
        }
    }
    else
    {
        // D inline asm (x86): convert straight from the stack slot of `value`.
        asm pure nothrow @nogc @trusted
        {
            cvtss2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
313 
/// Converts a `double` to a 32-bit signed integer, using the rounding mode
/// currently set in MXCSR (x86) or in the ARM floating-point control register.
///
/// Params:
///     value = the double-precision number to convert.
/// Returns: the converted integer.
int convertDoubleToInt32UsingMXCSR(double value) @trusted
{
    int result;
    version(GNU)
    {
        // GDC: extended inline asm, `value` bound to an SSE register ("x")
        // and `result` to a general-purpose register ("=r").
        // NOTE(review): this branch is selected for every GDC target, but
        // the instruction is x86-only - confirm non-x86 GDC is unsupported.
        asm pure nothrow @nogc @trusted
        {
            "cvtsd2si %1, %0\n": "=r"(result) : "x" (value);
        }
    }
    else static if (LDC_with_ARM32)
    {
        // Loads `value` into d2, converts it into s2, moves it to the result.
        // TODO: bug, doesn't preserve registers
        result = __asm!int(`vldr d2, $1
                            vcvtr.s32.f64 s2, d2
                            vmov $0, s2`, "=r,m", value);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        // Dispatch to the conversion helper matching the rounding bits.
        // `default` shares the round-to-nearest case.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     result = vcvtns_s32_f64(value); break;
            case _MM_ROUND_DOWN_ARM:        result = vcvtms_s32_f64(value); break;
            case _MM_ROUND_UP_ARM:          result = vcvtps_s32_f64(value); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: result = vcvts_s32_f64(value);  break;
        }
    }
    else
    {
        // D inline asm (x86): convert straight from the stack slot of `value`.
        asm pure nothrow @nogc @trusted
        {
            cvtsd2si EAX, value;
            mov result, EAX;
        }
    }
    return result;
}
355 
/// Converts a `float` to a 64-bit signed integer, using the rounding mode
/// currently set in MXCSR (x86) or in the ARM floating-point control register.
///
/// Params:
///     value = the single-precision number to convert.
/// Returns: the converted integer.
long convertFloatToInt64UsingMXCSR(float value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.

        // Note: converting to double precision else rounding could be different for large integers
        double asDouble = value; 

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must break ties to even, like CVTSS2SI and the
            // ARM64 branch below. llvm_round would round ties away from zero
            // (e.g. 2.5 -> 3), so llvm_rint is used: in this case the current
            // rounding mode is round-to-nearest, hence ties go to even.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(asDouble));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(asDouble));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(asDouble));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(asDouble);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode, then dispatch to the matching helper.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f32(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f32(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f32(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f32(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64) // 64-bit can just use the right instruction
        {
            static assert(GDC_with_SSE);
            __m128 A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtss2si64 (A);
        }
        else version(X86) // 32-bit
        {
            // This is untested!
            // Same FPU sequence as the D_InlineAsm_X86 branch, in GCC syntax.
            // NOTE(review): the scratch variables are written by the asm but
            // are declared as inputs, and `value` uses the "f" constraint here
            // while the double variant uses "t" - confirm before relying on it.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n" 
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "f" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
        else
            static assert(false);
    }
    else
        static assert(false);
}
489 
490 
/// Converts a `double` to a 64-bit signed integer, using the rounding mode
/// currently set in MXCSR (x86) or in the ARM floating-point control register.
///
/// Params:
///     value = the double-precision number to convert.
/// Returns: the converted integer.
long convertDoubleToInt64UsingMXCSR(double value) @trusted
{
    static if (LDC_with_ARM32)
    {
        // We have to resort to libc since 32-bit ARM 
        // doesn't seem to have 64-bit registers.
        uint fpscr = arm_get_fpcr(); // Get current rounding mode.
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            // Round-to-nearest must break ties to even, like CVTSD2SI and the
            // ARM64 branch below. llvm_round would round ties away from zero
            // (e.g. 2.5 -> 3), so llvm_rint is used: in this case the current
            // rounding mode is round-to-nearest, hence ties go to even.
            case _MM_ROUND_NEAREST_ARM:     return cast(long)(llvm_rint(value));
            case _MM_ROUND_DOWN_ARM:        return cast(long)(llvm_floor(value));
            case _MM_ROUND_UP_ARM:          return cast(long)(llvm_ceil(value));
            case _MM_ROUND_TOWARD_ZERO_ARM: return cast(long)(value);
        }
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();

        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtns_s64_f64(value);
            case _MM_ROUND_DOWN_ARM:        return vcvtms_s64_f64(value);
            case _MM_ROUND_UP_ARM:          return vcvtps_s64_f64(value);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvts_s64_f64(value);
        }
    }
    // 64-bit can use an SSE instruction
    else version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;
            fld value;
            fnstcw savedFPUCW;
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp result;            // convert with the temporary control word
            fldcw savedFPUCW;
        }
        return result;
    }
    else static if (GDC_with_x86)
    {
        version(X86_64)
        {
            static assert(GDC_with_SSE2);
            __m128d A;
            A.ptr[0] = value;
            return __builtin_ia32_cvtsd2si64 (A);
        }
        else
        {
            // This is untested!
            // Same FPU sequence as the D_InlineAsm_X86 branch, in GCC syntax.
            // NOTE(review): the scratch variables are written by the asm but
            // are declared as inputs, and `value` uses the "t" constraint here
            // while the float variant uses "f" - confirm before relying on it.
            uint sseRounding;
            ushort savedFPUCW;
            ushort newFPUCW;
            long result;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %1;\n" ~
                "fld %2;\n" ~
                "fnstcw %3;\n" ~
                "movw %3, %%ax;\n" ~
                "andw $0xf3ff, %%ax;\n" ~
                "movzwl %1, %%ecx;\n" ~
                "andl $0x6000, %%ecx;\n" ~
                "shrl $3, %%ecx;\n" ~
                "orw %%cx, %%ax\n" ~
                "movw %%ax, %4;\n" ~
                "fldcw %4;\n" ~
                "fistpll %0;\n" ~
                "fldcw %3;\n"         
                  : "=m"(result)    // %0
                  : "m" (sseRounding),
                    "t" (value),
                    "m" (savedFPUCW),
                    "m" (newFPUCW) 
                  : "eax", "ecx", "st";
            }
            return result;
        }
    }
    else
        static assert(false);
}
619 
620 //
621 //  </ROUNDING>
622 //
623 
624 
625 // using the Intel terminology here
626 
/// Clamps a signed 16-bit value to the range of a signed byte [-128, 127].
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    enum short lowest = byte.min;
    enum short highest = byte.max;
    const short clamped = value > highest ? highest
                        : (value < lowest ? lowest : value);
    return cast(byte) clamped;
}
633 
/// Clamps a signed 16-bit value to the range of an unsigned byte [0, 255].
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    enum short lowest = ubyte.min;
    enum short highest = ubyte.max;
    const short clamped = value > highest ? highest
                        : (value < lowest ? lowest : value);
    return cast(ubyte) clamped;
}
640 
/// Clamps a signed 32-bit value to the range of a signed 16-bit word [-32768, 32767].
short saturateSignedIntToSignedShort(int value) pure @safe
{
    enum int lowest = short.min;
    enum int highest = short.max;
    const int clamped = value > highest ? highest
                      : (value < lowest ? lowest : value);
    return cast(short) clamped;
}
647 
/// Clamps a signed 32-bit value to the range of an unsigned 16-bit word [0, 65535].
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    enum int lowest = ushort.min;
    enum int highest = ushort.max;
    const int clamped = value > highest ? highest
                      : (value < lowest ? lowest : value);
    return cast(ushort) clamped;
}
654 
unittest // test saturate operations
{
    // word -> signed byte
    assert(saturateSignedWordToSignedByte(32000) == 127);
    assert(saturateSignedWordToSignedByte(-4000) == -128);

    // word -> unsigned byte
    assert(saturateSignedWordToUnsignedByte(32000) == 255);
    assert(saturateSignedWordToUnsignedByte(-4000) == 0);

    // int -> signed short
    assert(saturateSignedIntToSignedShort(32768) == 32767);
    assert(saturateSignedIntToSignedShort(-32769) == -32768);

    // int -> unsigned short
    assert(saturateSignedIntToUnsignedShort(32768) == 32768);
    assert(saturateSignedIntToUnsignedShort(-32769) == 0);
}
666 
version(unittest)
{
    // Debugging helpers for tests: each one prints the lanes of a vector
    // on a single line with printf.
    import core.stdc.stdio: printf;

    // Note: you can override `pure` within a `debug` clause

    /// Prints a __m64 as one 64-bit integer.
    void _mm_print_pi64(__m64 v) @trusted
    {
        long1 asLong = cast(long1)v;
        const long lane = asLong.array[0];
        printf("%lld\n", lane);
    }

    /// Prints a __m64 as two 32-bit integers.
    void _mm_print_pi32(__m64 v) @trusted
    {
        int[2] lanes = (cast(int2)v).array;
        printf("%d %d\n", lanes[0], lanes[1]);
    }

    /// Prints a __m64 as four 16-bit integers.
    void _mm_print_pi16(__m64 v) @trusted
    {
        short[4] lanes = (cast(short4)v).array;
        printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints a __m64 as eight 8-bit integers.
    void _mm_print_pi8(__m64 v) @trusted
    {
        byte[8] lanes = (cast(byte8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints a __m128i as two 64-bit integers.
    void _mm_print_epi64(__m128i v) @trusted
    {
        long2 asLong = cast(long2)v;
        const long low = asLong.array[0];
        const long high = asLong.array[1];
        printf("%lld %lld\n", low, high);
    }

    /// Prints a __m128i as four 32-bit integers.
    void _mm_print_epi32(__m128i v) @trusted
    {
        int[4] lanes = v.array;
        printf("%d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints a __m128i as eight 16-bit integers.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short[8] lanes = (cast(short8)v).array;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints a __m128i as sixteen 8-bit integers.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte[16] lanes = (cast(byte16)v).array;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7],
               lanes[8], lanes[9], lanes[10], lanes[11],
               lanes[12], lanes[13], lanes[14], lanes[15]);
    }

    /// Prints a __m128 as four floats.
    void _mm_print_ps(__m128 v) @trusted
    {
        float[4] lanes = (cast(float4)v).array;
        printf("%f %f %f %f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    }

    /// Prints a __m128d as two doubles.
    void _mm_print_pd(__m128d v) @trusted
    {
        double[2] lanes = (cast(double2)v).array;
        printf("%f %f\n", lanes[0], lanes[1]);
    }
}
738 
739 
740 //
741 //  <FLOATING-POINT COMPARISONS>
742 //
743 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
744 //       need different IR generation.
745 
/// Floating-point comparison predicates.
/// "Ordered" predicates are false when either operand is NaN; "unordered"
/// predicates are true in that case.
/// The member order matters: the values index `FPComparisonToString`.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}
763 
// Predicate names spliced into the `fcmp` instructions of the inline IR
// used by the LDC comparison helpers. Indexed by FPComparison value, so the
// entries must stay in the same order as the enum members.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];
781 
// Individual float comparison: returns `true` when `comparison` holds for
// `a` and `b`, `false` otherwise (the packed helpers map this to -1 / 0).
// Useful for DMD and testing
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    if (isnan(a) || isnan(b))
    {
        // At least one NaN: every ordered predicate is false and every
        // unordered predicate is true.
        final switch(comparison) with(FPComparison)
        {
            case oeq, ogt, oge, olt, ole, one, ord:
                return false;
            case ueq, ugt, uge, ult, ule, une, uno:
                return true;
        }
    }

    // No NaN involved: ordered and unordered variants agree.
    final switch(comparison) with(FPComparison)
    {
        case oeq, ueq: return a == b;
        case ogt, ugt: return a > b;
        case oge, uge: return a >= b;
        case olt, ult: return a < b;
        case ole, ule: return a <= b;
        case one, une: return a != b;
        case ord:      return true;
        case uno:      return false;
    }
}
805 
806 version(LDC)
807 {
    /// Provides packed float comparisons.
    /// Compares the four lanes of `a` and `b` with an `fcmp` using the
    /// predicate named by `comparison`; each lane of the result is the
    /// sign-extended comparison bit (-1 when it holds, 0 otherwise).
    package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        // The predicate name is spliced into the IR at compile time.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
            %r = sext <4 x i1> %cmp to <4 x i32>
            ret <4 x i32> %r`;

        return LDCInlineIR!(ir, int4, float4, float4)(a, b);
    }
818 
    /// Provides packed double comparisons.
    /// Compares the two lanes of `a` and `b` with an `fcmp` using the
    /// predicate named by `comparison`; each lane of the result is the
    /// sign-extended comparison bit (-1 when it holds, 0 otherwise).
    package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        // The predicate name is spliced into the IR at compile time.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
            %r = sext <2 x i1> %cmp to <2 x i64>
            ret <2 x i64> %r`;

        return LDCInlineIR!(ir, long2, double2, double2)(a, b);
    }
829 
    /// CMPSS-style comparisons: only lane 0 is compared, the other lanes
    /// of the result are copied from `a`.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7. 
    /// Not that simple.
    package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
    {
        // Disabled sketch of the builtin-based implementation:
        /*
        enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
        enum bool invertOp = (predicateNumber & 0x80) != 0;
        static if(invertOp)
            return __builtin_ia32_cmpss(b, a, predicateNumber & 0x7f);
        else
            return __builtin_ia32_cmpss(a, b, predicateNumber & 0x7f);
        */
        // Scalar fcmp whose sign-extended result is bitcast to float.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
            %r = sext i1 %cmp to i32
            %r2 = bitcast i32 %r to float
            ret float %r2`;

        float4 r = a;
        r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
        return r;
    }
855 
    /// CMPSD-style comparisons: only lane 0 is compared, the other lane
    /// of the result is copied from `a`.
    /// clang implements it through x86 intrinsics; it is possible with IR alone
    /// but leads to less optimal code.
    /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. 
    /// Not that simple.    
    package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
    {
        // Scalar fcmp whose sign-extended result is bitcast to double.
        enum ir = `
            %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
            %r = sext i1 %cmp to i64
            %r2 = bitcast i64 %r to double
            ret double %r2`;

        double2 r = a;
        r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
        return r;
    }
873 }
874 else
875 {
876     /// Provides packed float comparisons
877     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @trusted
878     {
879         int4 result;
880         foreach(i; 0..4)
881         {
882             result.ptr[i] = compareFloat!float(comparison, a.array[i], b.array[i]) ? -1 : 0;
883         }
884         return result;
885     }
886 
887     /// Provides packed double comparisons
888     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @trusted
889     {
890         long2 result;
891         foreach(i; 0..2)
892         {
893             result.ptr[i] = compareFloat!double(comparison, a.array[i], b.array[i]) ? -1 : 0;
894         }
895         return result;
896     }
897 
898     /// Provides CMPSS-style comparison
899     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @trusted
900     {
901         int4 result = cast(int4)a;
902         result.ptr[0] = compareFloat!float(comparison, a.array[0], b.array[0]) ? -1 : 0;
903         return cast(float4)result;
904     }
905 
906     /// Provides CMPSD-style comparison
907     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @trusted
908     {
909         long2 result = cast(long2)a;
910         result.ptr[0] = compareFloat!double(comparison, a.array[0], b.array[0]) ? -1 : 0;
911         return cast(double2)result;
912     }
913 }
unittest // cmpps
{
    // Evaluates one predicate on (lhs, rhs) and checks all four lane masks.
    void expectMask(FPComparison cmp)(float4 lhs, float4 rhs, const int[4] expected)
    {
        int4 got = cmpps!cmp(lhs, rhs);
        assert(got.array == expected);
    }

    // Lane 0: A < B, lane 1: A == B, lane 2: A > B, lane 3: A is NaN (unordered).
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    // Expected masks, one table per predicate.
    static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt    = [-1, 0, 0, 0];
    static immutable int[4] correct_ole    = [-1,-1, 0, 0];
    static immutable int[4] correct_one    = [-1, 0,-1, 0];
    static immutable int[4] correct_ord    = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult    = [-1, 0, 0,-1];
    static immutable int[4] correct_ule    = [-1,-1, 0,-1];
    static immutable int[4] correct_une    = [-1, 0,-1,-1];
    static immutable int[4] correct_uno    = [ 0, 0, 0,-1];

    // Ordered predicates: false whenever an operand is NaN.
    expectMask!(FPComparison.oeq)(A, B, correct_oeq);
    expectMask!(FPComparison.ogt)(A, B, correct_ogt);
    expectMask!(FPComparison.oge)(A, B, correct_oge);
    expectMask!(FPComparison.olt)(A, B, correct_olt);
    expectMask!(FPComparison.ole)(A, B, correct_ole);
    expectMask!(FPComparison.one)(A, B, correct_one);
    expectMask!(FPComparison.ord)(A, B, correct_ord);

    // Unordered predicates: true whenever an operand is NaN.
    expectMask!(FPComparison.ueq)(A, B, correct_ueq);
    expectMask!(FPComparison.ugt)(A, B, correct_ugt);
    expectMask!(FPComparison.uge)(A, B, correct_uge);
    expectMask!(FPComparison.ult)(A, B, correct_ult);
    expectMask!(FPComparison.ule)(A, B, correct_ule);
    expectMask!(FPComparison.une)(A, B, correct_une);
    expectMask!(FPComparison.uno)(A, B, correct_uno);
}
unittest
{
    // `ult` holds in lane 0 (1 < 2) and fails in lane 1 (3 == 3).
    static immutable long[2] expected = [-1L, 0L];
    double2 lhs = [1, 3];
    double2 rhs = [2, 3];
    long2 mask = cmppd!(FPComparison.ult)(lhs, rhs);
    assert(mask.array == expected);
}
unittest // cmpss
{
    // Checks one predicate: lane 0 must hold the comparison mask,
    // lanes 1..3 must be taken from the first operand.
    void testComparison(FPComparison comparison)(float4 lhs, float4 rhs)
    {
        float4 got = cmpss!comparison(lhs, rhs);
        int4 gotBits = cast(int4)got;
        int wanted = 0;
        if (compareFloat!float(comparison, lhs.array[0], rhs.array[0]))
            wanted = -1;
        assert(gotBits.array[0] == wanted);
        foreach(lane; 1..4)
            assert(got.array[lane] == lhs.array[lane]);
    }

    // Runs a predicate first against an ordered operand, then against a NaN operand.
    void testBoth(FPComparison comparison)(float4 lhs, float4 ordered, float4 unordered)
    {
        testComparison!comparison(lhs, ordered);
        testComparison!comparison(lhs, unordered);
    }

    // Check that every comparison type is working
    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    testBoth!(FPComparison.oeq)(A, B, C);
    testBoth!(FPComparison.ogt)(A, B, C);
    testBoth!(FPComparison.oge)(A, B, C);
    testBoth!(FPComparison.olt)(A, B, C);
    testBoth!(FPComparison.ole)(A, B, C);
    testBoth!(FPComparison.one)(A, B, C);
    testBoth!(FPComparison.ord)(A, B, C);
    testBoth!(FPComparison.ueq)(A, B, C);
    testBoth!(FPComparison.ugt)(A, B, C);
    testBoth!(FPComparison.uge)(A, B, C);
    testBoth!(FPComparison.ult)(A, B, C);
    testBoth!(FPComparison.ule)(A, B, C);
    testBoth!(FPComparison.une)(A, B, C);
    testBoth!(FPComparison.uno)(A, B, C);
}
unittest // cmpsd
{
    // Checks one predicate: lane 0 must hold the comparison mask,
    // lane 1 must be taken from the first operand.
    void testComparison(FPComparison comparison)(double2 lhs, double2 rhs)
    {
        double2 got = cmpsd!comparison(lhs, rhs);
        long2 gotBits = cast(long2)got;
        long wanted = 0;
        if (compareFloat!double(comparison, lhs.array[0], rhs.array[0]))
            wanted = -1;
        assert(gotBits.array[0] == wanted);
        assert(got.array[1] == lhs.array[1]);
    }

    // Runs a predicate first against an ordered operand, then against a NaN operand.
    void testBoth(FPComparison comparison)(double2 lhs, double2 ordered, double2 unordered)
    {
        testComparison!comparison(lhs, ordered);
        testComparison!comparison(lhs, unordered);
    }

    // Check that every comparison type is working
    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    testBoth!(FPComparison.oeq)(A, B, C);
    testBoth!(FPComparison.ogt)(A, B, C);
    testBoth!(FPComparison.oge)(A, B, C);
    testBoth!(FPComparison.olt)(A, B, C);
    testBoth!(FPComparison.ole)(A, B, C);
    testBoth!(FPComparison.one)(A, B, C);
    testBoth!(FPComparison.ord)(A, B, C);
    testBoth!(FPComparison.ueq)(A, B, C);
    testBoth!(FPComparison.ugt)(A, B, C);
    testBoth!(FPComparison.uge)(A, B, C);
    testBoth!(FPComparison.ult)(A, B, C);
    testBoth!(FPComparison.ule)(A, B, C);
    testBoth!(FPComparison.une)(A, B, C);
    testBoth!(FPComparison.uno)(A, B, C);
}
1065 
1066 //
1067 //  </FLOATING-POINT COMPARISONS>
1068 //
1069 
1070 
/// Narrows a 128-bit integer vector to `__m64` by keeping its low 64 bits.
__m64 to_m64(__m128i a) pure @trusted
{
    // View the input as two 64-bit lanes and keep lane 0 only.
    long2 asLongs = cast(long2)a;
    long1 lowHalf = asLongs.array[0];
    return lowHalf;
}
1077 
/// Widens a 64-bit `__m64` into a 128-bit `__m128i`: the low 64-bit lane receives
/// the value held by `a`, the high 64-bit lane is zero.
__m128i to_m128i(__m64 a) pure @trusted
{
  /* Not sufficient to avoid https://issues.dlang.org/show_bug.cgi?id=21474 
    
    version(DigitalMars) // Workaround for https://issues.dlang.org/show_bug.cgi?id=21474 
    {
        long2 r = a.array[0];
        r.ptr[1] = 0;
        return cast(int4)r;
    }
    else */
    {
        long2 r = [0, 0];       // both 64-bit lanes start at zero
        r.ptr[0] = a.array[0];  // copy the single lane of `a` into lane 0
        return cast(__m128i)r;  // reinterpret the 128 bits as __m128i
    }
}
1095 
1096 // ADDITIONAL x86 INTRINSICS
1097 // Absent from ldc.gccbuiltins_x86 for some reason, but needed.
1098 // https://github.com/ldc-developers/llvm-project/blob/ldc-release/12.x/llvm/include/llvm/IR/IntrinsicsX86.td
static if (LDC_with_SSE41)
{
    // Binds the LLVM intrinsic `llvm.x86.sse41.pblendvb` under a GCC-style builtin name.
    // Only declared when LDC_with_SSE41 is true. Takes three byte16 operands and
    // returns a byte16.
    pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
        byte16 __builtin_ia32_pblendvb(byte16, byte16, byte16) pure @safe;
}
1104 
// SOME NEON INTRINSICS
// Emulating some x86 intrinsics needs access to a range of ARM intrinsics.
// They are not part of the public API here, although the simde project exposes them all to its users.
// MAYDO: create a new neon.d module, for internal use only.
// MAYDO: port them to ARM32 so that ARM32 can be as fast as ARM64.
1110 static if (LDC_with_ARM64)
1111 {
1112     // VERY USEFUL LINK
1113     // https://github.com/ldc-developers/llvm-project/blob/ldc-release/11.x/llvm/include/llvm/IR/IntrinsicsAArch64.td
1114     // Also: https://developer.arm.com/architectures/instruction-sets/intrinsics/
1115 
    // Bindings to the `llvm.aarch64.crc32c*` intrinsics. Each takes a 32-bit
    // accumulator `a` and a data operand `b` and returns a 32-bit result.
    // The b/h/w/x suffix presumably selects an 8/16/32/64-bit data width
    // (see the ARM reference); only the `x` variant takes a 64-bit `b` here.
    pragma(LDC_intrinsic, "llvm.aarch64.crc32cb")
        uint __crc32cb(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32ch")
        uint __crc32ch(uint a, uint b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.crc32cw")
        uint __crc32cw(uint a, uint b) pure @safe;

    // Note: the D name ends in `d` while the bound LLVM intrinsic is `crc32cx`.
    pragma(LDC_intrinsic, "llvm.aarch64.crc32cx")
        uint __crc32cd(uint a, ulong b) pure @safe;

    // Binding to `llvm.aarch64.neon.uabd` on 16 bytes
    // (unsigned absolute difference, per the ARM instruction reference).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uabd.v16i8")
        byte16 vabdq_u8(byte16 a, byte16 b) pure @safe;

    // Bindings to `llvm.aarch64.neon.abs` for 8 x 16-bit, 4 x 32-bit and 16 x 8-bit lanes.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v8i16")
        short8 vabsq_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v4i32")
        int4 vabsq_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.abs.v16i8")
        byte16 vabsq_s8(byte16 a) pure @safe;
1139 
1140     byte8 vand_u8(byte8 a, byte8 b) pure @safe
1141     {
1142         return a & b;
1143     }
1144 
1145     long2 vandq_s64(long2 a, long2 b)
1146     {
1147         return a & b;
1148     }
1149 
1150     long2 vbicq_s64(long2 a, long2 b) pure @safe
1151     {
1152         return a & ~b;
1153     }
1154 
1155     int4 vbslq_s32(int4 a, int4 b, int4 c) pure @safe
1156     {
1157         return c ^ ((c ^ b) & a);
1158     }
1159 
1160     byte16 vbslq_s8(byte16 a, byte16 b, byte16 c) pure @safe
1161     {
1162         return c ^ ((c ^ b) & a);
1163     }
1164 
1165     long2 vbslq_s64(long2 a, long2 b, long2 c) pure @safe
1166     {
1167         return c ^ ((c ^ b) & a);
1168     }
1169 
1170     short8 vcombine_s16(short4 lo, short4 hi) pure @trusted
1171     {
1172         short8 r;
1173         r.ptr[0]  = lo.array[0];
1174         r.ptr[1]  = lo.array[1];
1175         r.ptr[2]  = lo.array[2];
1176         r.ptr[3]  = lo.array[3];
1177         r.ptr[4]  = hi.array[0];
1178         r.ptr[5]  = hi.array[1];
1179         r.ptr[6]  = hi.array[2];
1180         r.ptr[7]  = hi.array[3];
1181         return r;
1182     }
1183 
1184     int4 vcombine_s32(int2 lo, int2 hi) pure @trusted
1185     {
1186         int4 r;
1187         r.ptr[0] = lo.array[0];
1188         r.ptr[1] = lo.array[1];
1189         r.ptr[2] = hi.array[0];
1190         r.ptr[3] = hi.array[1];
1191         return r;
1192     }
1193 
1194     byte16 vcombine_s8(byte8 lo, byte8 hi) pure @trusted
1195     {
1196         byte16 r;
1197         r.ptr[0]  = lo.array[0];
1198         r.ptr[1]  = lo.array[1];
1199         r.ptr[2]  = lo.array[2];
1200         r.ptr[3]  = lo.array[3];
1201         r.ptr[4]  = lo.array[4];
1202         r.ptr[5]  = lo.array[5];
1203         r.ptr[6]  = lo.array[6];
1204         r.ptr[7]  = lo.array[7];
1205         r.ptr[8]  = hi.array[0];
1206         r.ptr[9]  = hi.array[1];
1207         r.ptr[10] = hi.array[2];
1208         r.ptr[11] = hi.array[3];
1209         r.ptr[12] = hi.array[4];
1210         r.ptr[13] = hi.array[5];
1211         r.ptr[14] = hi.array[6];
1212         r.ptr[15] = hi.array[7];
1213         return r;
1214     }
1215 
1216     short8 vcombine_u16(short4 lo, short4 hi) pure @trusted
1217     {
1218         short8 r;
1219         r.ptr[0]  = lo.array[0];
1220         r.ptr[1]  = lo.array[1];
1221         r.ptr[2]  = lo.array[2];
1222         r.ptr[3]  = lo.array[3];
1223         r.ptr[4]  = hi.array[0];
1224         r.ptr[5]  = hi.array[1];
1225         r.ptr[6]  = hi.array[2];
1226         r.ptr[7]  = hi.array[3];
1227         return r;
1228     }
1229 
1230 
    // Float-to-signed-integer conversions, bound to the `llvm.aarch64.neon.fcvt?s`
    // intrinsics. Four variants are declared for each operand shape: fcvtms, fcvtns,
    // fcvtps and fcvtzs. Per the ARM instruction reference these round toward minus
    // infinity, to nearest (ties to even), toward plus infinity and toward zero
    // respectively.

    // float4 => int4

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v4i32.v4f32")
        int4 vcvtmq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v4i32.v4f32")
        int4 vcvtnq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v4i32.v4f32")
        int4 vcvtpq_s32_f32(float4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v4i32.v4f32")
        int4 vcvtzq_s32_f32(float4 a) pure @safe;


    // double2 => long2

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.v2i64.v2f64")
        long2 vcvtmq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.v2i64.v2f64")
        long2 vcvtnq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.v2i64.v2f64")
        long2 vcvtpq_s64_f64(double2 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.v2i64.v2f64")
        long2 vcvtzq_s64_f64(double2 a) pure @safe;

    // float => int
    // In the scalar groups below, the fcvtzs binding is named `vcvts_*`.

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f32")
        int vcvtms_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f32")
        int vcvtns_s32_f32(float a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f32")
        int vcvtps_s32_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f32")
        int vcvts_s32_f32(float a) pure @safe;
     
    // double => int

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i32.f64")
        int vcvtms_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i32.f64")
        int vcvtns_s32_f64(double a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i32.f64")
        int vcvtps_s32_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i32.f64")
        int vcvts_s32_f64(double a) pure @safe;

    // float => long

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f32")
        long vcvtms_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f32")
        long vcvtns_s64_f32(float a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f32")
        long vcvtps_s64_f32(float a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f32")
        long vcvts_s64_f32(float a) pure @safe;

    // double => long

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtms.i64.f64")
        long vcvtms_s64_f64(double a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtns.i64.f64")
        long vcvtns_s64_f64(double a) pure @safe;    

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtps.i64.f64")
        long vcvtps_s64_f64(double a) pure @safe; // Note: technically should be named vcvtpd_s64_f64

    pragma(LDC_intrinsic, "llvm.aarch64.neon.fcvtzs.i64.f64")
        long vcvts_s64_f64(double a) pure @safe;
1307 
1308     long2 vdupq_n_s64(long value) pure @safe
1309     {
1310         long2 r;
1311         r = value;
1312         return r;
1313     }
1314 
1315     short4 vget_high_s16(short8 a) pure @trusted
1316     {
1317         short4 r;
1318         r.ptr[0] = a.array[4];
1319         r.ptr[1] = a.array[5];
1320         r.ptr[2] = a.array[6];
1321         r.ptr[3] = a.array[7];
1322         return r;
1323     }
1324 
1325     int2 vget_high_s32(int4 a) pure @trusted
1326     {
1327         int2 r;
1328         r.ptr[0] = a.array[2];
1329         r.ptr[1] = a.array[3];
1330         return r;
1331     }
1332 
1333     byte8 vget_high_u8(byte16 a) pure @trusted
1334     {
1335         byte8 r;
1336         r.ptr[0] = a.array[8];
1337         r.ptr[1] = a.array[9];
1338         r.ptr[2] = a.array[10];
1339         r.ptr[3] = a.array[11];
1340         r.ptr[4] = a.array[12];
1341         r.ptr[5] = a.array[13];
1342         r.ptr[6] = a.array[14];
1343         r.ptr[7] = a.array[15];
1344         return r;
1345     }
1346 
1347     short4 vget_low_s16(short8 a) pure @trusted
1348     {
1349         short4 r;
1350         r.ptr[0] = a.array[0];
1351         r.ptr[1] = a.array[1];
1352         r.ptr[2] = a.array[2];
1353         r.ptr[3] = a.array[3];
1354         return r;
1355     } 
1356 
1357     int2 vget_low_s32(int4 a) pure @trusted
1358     {
1359         int2 r;
1360         r.ptr[0] = a.array[0];
1361         r.ptr[1] = a.array[1];
1362         return r;
1363     }
1364 
1365     byte8 vget_low_u8(byte16 a) pure @trusted
1366     {
1367         byte8 r;
1368         r.ptr[0] = a.array[0];
1369         r.ptr[1] = a.array[1];
1370         r.ptr[2] = a.array[2];
1371         r.ptr[3] = a.array[3];
1372         r.ptr[4] = a.array[4];
1373         r.ptr[5] = a.array[5];
1374         r.ptr[6] = a.array[6];
1375         r.ptr[7] = a.array[7];
1376         return r;
1377     }
1378 
1379     long vgetq_lane_s64(long2 v, const int lane) pure @safe
1380     {
1381         return v.array[lane];
1382     }
1383 
    // Binding to `llvm.aarch64.neon.smax` on 8 x 16-bit lanes
    // (signed lane-wise maximum, per the ARM instruction reference).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smax.v8i16")
        short8 vmaxq_s16(short8 a, short8 b) pure @safe;
1386 
1387     int4 vmaxq_s32(int4 a, int4 b)
1388     {
1389         int4 r;
1390         r[0] = a[0] >= b[0] ? a[0] : b[0];
1391         r[1] = a[1] >= b[1] ? a[1] : b[1];
1392         r[2] = a[2] >= b[2] ? a[2] : b[2];
1393         r[3] = a[3] >= b[3] ? a[3] : b[3];
1394         return r;
1395     }
1396 
    // Binding to `llvm.aarch64.neon.smin` on 8 x 16-bit lanes
    // (signed lane-wise minimum, per the ARM instruction reference).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smin.v8i16")
        short8 vminq_s16(short8 a, short8 b) pure @safe;
1399 
1400     int2 vmovn_s64(long2 a) pure @trusted
1401     {
1402         int2 r;
1403         r.ptr[0] = cast(int)(a.array[0]);
1404         r.ptr[1] = cast(int)(a.array[1]);
1405         return r;
1406     }        
1407 
1408     int4 vmull_s16(short4 a, short4 b) pure @trusted
1409     {
1410         int4 r;
1411         r.ptr[0] = a.array[0] * b.array[0];
1412         r.ptr[1] = a.array[1] * b.array[1];
1413         r.ptr[2] = a.array[2] * b.array[2];
1414         r.ptr[3] = a.array[3] * b.array[3];
1415         return r;
1416     }
1417 
    // Binding to `llvm.aarch64.neon.smull`: two 32-bit lanes in, two 64-bit lanes out
    // (signed widening multiply, per the ARM instruction reference).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.smull.v2i64")
        long2 vmull_s32(int2 a, int2 b) pure @safe;

    // Bindings to `llvm.aarch64.neon.addp` on 64-bit vectors
    // (pairwise add, per the ARM instruction reference).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i16")
        short4 vpadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v2i32")
        int2 vpadd_s32(int2 a, int2 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i8")
        byte8 vpadd_u8(byte8 a, byte8 b) pure @safe;

    // Binding to `llvm.aarch64.neon.uaddlp`: 16 bytes in, 8 x 16-bit lanes out
    // (unsigned pairwise add long, per the ARM instruction reference).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.uaddlp.v8i16.v16i8")
        short8 vpaddlq_u8 (byte16 a) pure @safe;
1432 
    // `vpaddq_f32` binds to a differently named LLVM intrinsic depending on the
    // front-end version: `faddp` from __VERSION__ 2088 on, `addp` before that.
    static if(__VERSION__ >= 2088) // LDC 1.18 start using LLVM9 who changes the name of the builtin
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.faddp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
    else
    {
        pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4f32")
            float4 vpaddq_f32(float4 a, float4 b) pure @safe;
    }
1443     
    // Bindings to `llvm.aarch64.neon.addp` on 128-bit vectors
    // (pairwise add, per the ARM instruction reference).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v8i16")
        short8 vpaddq_s16(short8 a, short8 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v16i8")
        byte16 vpaddq_s8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.addp.v4i32")
        int4 vpaddq_s32(int4 a, int4 b) pure @safe;

    // Bindings to `llvm.aarch64.neon.sqadd`
    // (signed saturating add, per the ARM instruction reference).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v4i16")
        short4 vqadd_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqadd.v8i16")
        short8 vqaddq_s16(short8 a, short8 b) pure @safe;

    // Bindings to `sqxtn`, `uqxtn` and `sqxtun`: each halves the lane width
    // (saturating narrowing, per the ARM instruction reference).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v8i8")
        byte8 vqmovn_s16(short8 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtn.v4i16")
        short4 vqmovn_s32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.uqxtn.v4i16")
        short4 vqmovn_u32(int4 a) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqxtun.v8i8")
        byte8 vqmovun_s16(short8 a) pure @safe;

    // Bindings to `llvm.aarch64.neon.sqsub`
    // (signed saturating subtract, per the ARM instruction reference).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v4i16")
        short4 vqsub_s16(short4 a, short4 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.sqsub.v8i16")
        short8 vqsubq_s16(short8 a, short8 b) pure @safe;

    // Binding to `llvm.aarch64.neon.tbl1` with a 16-byte table `t` and 16 indices `idx`
    // (table lookup, per the ARM instruction reference).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v16i8")
        byte16 vqtbl1q_s8(byte16 t, byte16 idx) pure @safe;

    // Bindings to `llvm.aarch64.neon.urhadd`
    // (unsigned rounding halving add, per the ARM instruction reference).
    // NOTE(review): both take 128-bit vectors although the names carry no `q`; confirm
    // the naming against callers before renaming anything.
    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v16i8")
        byte16 vrhadd_u8(byte16 a, byte16 b) pure @safe;

    pragma(LDC_intrinsic, "llvm.aarch64.neon.urhadd.v8i16")
        short8 vrhadd_u16(short8 a, short8 b) pure @safe;

    // Binding to `llvm.aarch64.neon.rshrn`: 4 x 32-bit lanes in, 4 x 16-bit lanes out,
    // with shift amount `n` (rounding shift right narrow, per the ARM instruction
    // reference).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.rshrn.v4i16")
        short4 vrshrn_n_s32(int4 a, int n) pure @safe;        
1488 
1489     byte8 vshr_u8(byte8 a, byte8 b) pure @safe
1490     {
1491         return a >>> b;
1492     }
1493 
1494     byte16 vshrq_n_s8(byte16 a, byte r) pure @safe
1495     { 
1496         a = a >> byte16(cast(byte)r);
1497         return a;
1498     }
1499 
    // Binding to `llvm.aarch64.neon.tbl1` with a 16-byte table `t` and 8 indices `idx`,
    // returning 8 bytes (table lookup, per the ARM instruction reference).
    pragma(LDC_intrinsic, "llvm.aarch64.neon.tbl1.v8i8")
        byte8 vtbl1_s8(byte16 t, byte8 idx) pure @safe;
1502 }
1503 
version(unittest)
{
    /// Test-only helper returning the absolute value of `x`.
    double abs_double(double x) @trusted
    {
        version(LDC)
        {
            return llvm_fabs(x);
        }
        else
        {
            // Clear the sign bit (bit 63) of the 64-bit representation.
            ulong bits = *cast(ulong*)(&x);
            bits &= ~(1UL << 63);
            return *cast(double*)(&bits);
        }
    }
}
1518 
// Hand-rolled isnan overloads: needed because, in the old GDC used on Travis CI, core.stdc.math.isnan isn't pure.
1520 
/// Returns true when `x` is a NaN, judged from its 32-bit pattern:
/// all exponent bits set and a non-zero mantissa.
bool isnan(float x) pure @trusted
{
    enum uint exponentMask = 0x7F800000;
    enum uint mantissaMask = 0x007FFFFF;

    immutable uint bits = *cast(uint*)(&x);
    if ((bits & exponentMask) != exponentMask)
        return false; // finite value
    return (bits & mantissaMask) != 0; // a zero mantissa would be an infinity
}
unittest
{
    // A NaN must be reported...
    float notANumber = float.nan;
    assert(isnan(notANumber));

    // ...but neither zero nor infinity may be.
    float zero = 0;
    assert(!isnan(zero));

    float inf = float.infinity;
    assert(!isnan(inf));
}
1538 
/// Returns true when `x` is a NaN, judged from its 64-bit pattern:
/// all exponent bits set and a non-zero mantissa.
bool isnan(double x) pure @trusted
{
    enum ulong exponentMask = 0x7FF00000_00000000;
    enum ulong mantissaMask = 0x000FFFFF_FFFFFFFF;

    immutable ulong bits = *cast(ulong*)(&x);
    if ((bits & exponentMask) != exponentMask)
        return false; // finite value
    return (bits & mantissaMask) != 0; // a zero mantissa would be an infinity
}
unittest
{
    // A NaN must be reported...
    double notANumber = double.nan;
    assert(isnan(notANumber));

    // ...but neither zero nor infinity may be.
    double zero = 0;
    assert(!isnan(zero));

    double inf = double.infinity;
    assert(!isnan(inf));
}