1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2018.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.internals;
7 
8 import inteli.types;
9 
10 version(unittest)
11     import core.stdc.stdio;
12 
13 // The only math functions needed for intel-intrinsics
14 public import core.math: fabs, sqrt; // since they are intrinsics
15 
16 version(LDC)
17 {
18     public import core.simd;
19     public import ldc.simd;
20     public import ldc.gccbuiltins_x86;
21     public import ldc.intrinsics;
22 
23     // Since LDC 1.13, using the new ldc.llvmasm.__ir variants instead of inlineIR
24     static if (__VERSION__ >= 2083)
25     {
26          import ldc.llvmasm;
27          alias LDCInlineIR = __ir_pure;
28     }
29     else
30     {
31         alias LDCInlineIR = inlineIR;
32     }
33 }
34 
35 
36 
37 package:
38 nothrow @nogc:
39 
40 
41 //
42 //  <ROUNDING>
43 //
//  Why is this here? For DMD, we cannot use rint because _MM_SET_ROUNDING_MODE
//  doesn't change the FPU rounding mode, and isn't expected to do so.
//  So we devised these rounding functions to get consistent rounding between
//  LDC and DMD. It's important that DMD uses what is in MXCSR to round.
48 //
49 
50 
/// Converts a `float` to a 32-bit integer using the SSE `cvtss2si` instruction,
/// so that (per the rounding notes above) the rounding mode held in MXCSR is
/// the one applied, not the x87 FPU one.
/// Params:
///     value = the single-precision value to convert
/// Returns: the converted integer, as left in EAX by `cvtss2si`.
int convertFloatToInt32UsingMXCSR(float value) pure @safe
{
    int result;
    asm pure nothrow @nogc @trusted
    {
        cvtss2si EAX, value; // convert straight from the stack slot of `value`
        mov result, EAX;     // hand the result back to the D local
    }
    return result;
}
61 
/// Converts a `double` to a 32-bit integer using the SSE2 `cvtsd2si` instruction,
/// so that (per the rounding notes above) the rounding mode held in MXCSR is
/// the one applied, not the x87 FPU one.
/// Params:
///     value = the double-precision value to convert
/// Returns: the converted integer, as left in EAX by `cvtsd2si`.
int convertDoubleToInt32UsingMXCSR(double value) pure @safe
{
    int result;
    asm pure nothrow @nogc @trusted
    {
        cvtsd2si EAX, value; // convert straight from the stack slot of `value`
        mov result, EAX;     // hand the result back to the D local
    }
    return result;
}
72 
/// Converts a `float` to a 64-bit integer, rounding with the mode held in MXCSR.
///
/// On x86_64 this is a single `cvtss2si` with a 64-bit destination. On 32-bit x86
/// the conversion goes through the x87 FPU, whose control word is temporarily
/// rewritten so that its rounding bits mirror the MXCSR ones.
/// Params:
///     value = the single-precision value to convert
/// Returns: the converted 64-bit integer.
long convertFloatToInt64UsingMXCSR(float value) pure @safe
{
    // 64-bit can use an SSE instruction
    version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for " Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                cvtss2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movss XMM0, value;
                // Hand-encoded instruction bytes; must stay in sync with the LDC branch above.
                db 0xf3; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtss2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;     // read MXCSR into memory
            fld value;               // push the value on the x87 stack
            fnstcw savedFPUCW;       // remember the current FPU control word
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // 0x6000 >> 3 == 0x0c00, the bits cleared in AX above
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            fistp qword ptr result;            // convert, respecting MXCSR (but not other control word things)
            fldcw savedFPUCW;        // restore the caller's FPU control word
        }
        return result;
    }
    else
        static assert(false);
}
131 
/// Converts a `double` to a 64-bit integer, rounding with the mode held in MXCSR.
///
/// On x86_64 this is a single `cvtsd2si` with a 64-bit destination. On 32-bit x86
/// the conversion goes through the x87 FPU, whose control word is temporarily
/// rewritten so that its rounding bits mirror the MXCSR ones.
/// Params:
///     value = the double-precision value to convert
/// Returns: the converted 64-bit integer.
long convertDoubleToInt64UsingMXCSR(double value) pure @safe
{
    // 64-bit can use an SSE instruction
    version(D_InlineAsm_X86_64)
    {
        long result;
        version(LDC) // work-around for "Data definition directives inside inline asm are not supported yet."
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                cvtsd2si RAX, XMM0;
                mov result, RAX;
            }
        }
        else
        {
            asm pure nothrow @nogc @trusted
            {
                movsd XMM0, value;
                // Hand-encoded instruction bytes; must stay in sync with the LDC branch above.
                db 0xf2; db 0x48; db 0x0f; db 0x2d; db 0xc0; // cvtsd2si RAX, XMM0 (DMD refuses to emit)
                mov result, RAX;
            }
        }
        return result;
    }
    else version(D_InlineAsm_X86)
    {
        // In the case of 32-bit x86 there is no SSE2 way to convert FP to 64-bit int
        // This leads to an unfortunate FPU sequence in every C++ compiler.
        // See: https://godbolt.org/z/vZym77

        // Get current MXCSR rounding
        uint sseRounding;
        ushort savedFPUCW;
        ushort newFPUCW;
        long result;
        asm pure nothrow @nogc @trusted
        {
            stmxcsr sseRounding;     // read MXCSR into memory
            fld value;               // push the value on the x87 stack
            fnstcw savedFPUCW;       // remember the current FPU control word
            mov AX, savedFPUCW;
            and AX, 0xf3ff;          // clear FPU rounding bits
            movzx ECX, word ptr sseRounding;
            and ECX, 0x6000;         // only keep SSE rounding bits
            shr ECX, 3;              // 0x6000 >> 3 == 0x0c00, the bits cleared in AX above
            or AX, CX;               // make a new control word for FPU with SSE bits
            mov newFPUCW, AX;
            fldcw newFPUCW;
            // NOTE(review): unlike the float overload there is no explicit `qword ptr`
            // here; presumably the operand size comes from `result` being a `long` - confirm.
            fistp result;
            fldcw savedFPUCW;        // restore the caller's FPU control word
        }
        return result;
    }
    else
        static assert(false);
}
191 
192 
193 //
194 //  </ROUNDING>
195 //
196 
197 
198 // using the Intel terminology here
199 
/// Saturates a signed 16-bit value to the signed 8-bit range [-128, 127].
/// Values already inside the range are returned unchanged.
byte saturateSignedWordToSignedByte(short value) pure @safe
{
    if (value >= byte.max)
        return byte.max;
    if (value <= byte.min)
        return byte.min;
    return cast(byte) value;
}
206 
/// Saturates a signed 16-bit value to the unsigned 8-bit range [0, 255].
/// Negative inputs become 0, inputs above 255 become 255.
ubyte saturateSignedWordToUnsignedByte(short value) pure @safe
{
    if (value >= ubyte.max)
        return ubyte.max;
    if (value <= 0)
        return 0;
    return cast(ubyte) value;
}
213 
/// Saturates a signed 32-bit value to the signed 16-bit range [-32768, 32767].
/// Values already inside the range are returned unchanged.
short saturateSignedIntToSignedShort(int value) pure @safe
{
    if (value >= short.max)
        return short.max;
    if (value <= short.min)
        return short.min;
    return cast(short) value;
}
220 
/// Saturates a signed 32-bit value to the unsigned 16-bit range [0, 65535].
/// Negative inputs become 0, inputs above 65535 become 65535.
ushort saturateSignedIntToUnsignedShort(int value) pure @safe
{
    if (value >= ushort.max)
        return ushort.max;
    if (value <= 0)
        return 0;
    return cast(ushort) value;
}
227 
unittest // test saturate operations
{
    // word -> signed byte
    assert(saturateSignedWordToSignedByte(32000) == 127);
    assert(saturateSignedWordToSignedByte(-4000) == -128);

    // word -> unsigned byte
    assert(saturateSignedWordToUnsignedByte(32000) == 255);
    assert(saturateSignedWordToUnsignedByte(-4000) == 0);

    // int -> signed short
    assert(saturateSignedIntToSignedShort(32768) == 32767);
    assert(saturateSignedIntToSignedShort(-32769) == -32768);

    // int -> unsigned short (32768 is in range and must pass through)
    assert(saturateSignedIntToUnsignedShort(32768) == 32768);
    assert(saturateSignedIntToUnsignedShort(-32769) == 0);
}
239 
version(unittest)
{
    // Debug helpers that dump the lanes of a vector while working on an implementation.
    // Note: you can override `pure` within a `debug` clause

    /// Prints `v` as four space-separated lanes followed by a newline.
    void _mm_print_epi32(__m128i v) @trusted
    {
        printf("%d %d %d %d\n", v[0], v[1], v[2], v[3]);
    }

    /// Prints `v`, reinterpreted as `short8`, as eight space-separated lanes followed by a newline.
    void _mm_print_epi16(__m128i v) @trusted
    {
        short8 lanes = cast(short8)v;
        printf("%d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7]);
    }

    /// Prints `v`, reinterpreted as `byte16`, as sixteen space-separated lanes followed by a newline.
    void _mm_print_epi8(__m128i v) @trusted
    {
        byte16 lanes = cast(byte16)v;
        printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
               lanes[0], lanes[1], lanes[2], lanes[3],
               lanes[4], lanes[5], lanes[6], lanes[7],
               lanes[8], lanes[9], lanes[10], lanes[11],
               lanes[12], lanes[13], lanes[14], lanes[15]);
    }
}
264 
265 
266 //
267 //  <FLOATING-POINT COMPARISONS>
268 //
269 // Note: `ldc.simd` cannot express all nuances of FP comparisons, so we
270 //       need different IR generation.
271 
/// Floating-point comparison predicates.
/// The member names are the condition codes spliced into the LLVM `fcmp`
/// instructions of the LDC implementations (through `FPComparisonToString`).
/// "Ordered" predicates are false as soon as one operand is NaN,
/// "unordered" predicates are true as soon as one operand is NaN.
/// The member order must match the order of `FPComparisonToString`.
enum FPComparison
{
    oeq,   // ordered and equal
    ogt,   // ordered and greater than
    oge,   // ordered and greater than or equal
    olt,   // ordered and less than
    ole,   // ordered and less than or equal
    one,   // ordered and not equal
    ord,   // ordered (no nans)
    ueq,   // unordered or equal
    ugt,   // unordered or greater than ("nle")
    uge,   // unordered or greater than or equal ("nlt")
    ult,   // unordered or less than ("nge")
    ule,   // unordered or less than or equal ("ngt")
    une,   // unordered or not equal ("neq")
    uno,   // unordered (either nans)
}
289 
// Textual name of each FPComparison member, indexed by the enum value.
// The entries are concatenated into inline IR at compile time, and must be
// listed in exactly the same order as the members of FPComparison.
private static immutable string[FPComparison.max+1] FPComparisonToString =
[
    "oeq",
    "ogt",
    "oge",
    "olt",
    "ole",
    "one",
    "ord",
    "ueq",
    "ugt",
    "uge",
    "ult",
    "ule",
    "une",
    "uno",
];
307 
// Individual float comparison: returns `true` when `comparison` holds for (a, b),
// `false` otherwise. Callers turn that into the lane mask (-1 / 0) or flag (1 / 0)
// they need.
//
// This helper is used by the non-LDC fallback implementations of cmpps, cmppd,
// cmpss, cmpsd, comss and comsd as well as by the unit tests, so it has to be
// available in every build and must NOT be wrapped in `version(unittest)`.
//
// Params:
//     comparison = predicate to evaluate
//     a          = left operand
//     b          = right operand
private bool compareFloat(T)(FPComparison comparison, T a, T b) pure @safe
{
    import std.math : isNaN;

    // "unordered" means at least one operand is a NaN
    immutable bool unordered = isNaN(a) || isNaN(b);

    final switch(comparison) with(FPComparison)
    {
        // Ordered predicates: ==, >, >=, <, <= are already false when a NaN is involved.
        case oeq: return a == b;
        case ogt: return a > b;
        case oge: return a >= b;
        case olt: return a < b;
        case ole: return a <= b;
        case one: return !unordered && (a != b); // NaN with != always yields true
        case ord: return !unordered;

        // Unordered predicates: true as soon as a NaN is involved.
        case ueq: return unordered || (a == b);
        case ugt: return unordered || (a > b);
        case uge: return unordered || (a >= b);
        case ult: return unordered || (a < b);
        case ule: return unordered || (a <= b);
        case une: return (a != b); // NaN with != always yields true
        case uno: return unordered;
    }
}
335 
336 version(LDC)
337 {
338     /// Provides packed float comparisons
339     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
340     {
341         enum ir = `
342             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <4 x float> %0, %1
343             %r = sext <4 x i1> %cmp to <4 x i32>
344             ret <4 x i32> %r`;
345 
346         return LDCInlineIR!(ir, int4, float4, float4)(a, b);
347     }
348 
349     /// Provides packed double comparisons
350     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
351     {
352         enum ir = `
353             %cmp = fcmp `~ FPComparisonToString[comparison] ~` <2 x double> %0, %1
354             %r = sext <2 x i1> %cmp to <2 x i64>
355             ret <2 x i64> %r`;
356 
357         return LDCInlineIR!(ir, long2, double2, double2)(a, b);
358     }
359 
360     /// CMPSS-style comparisons
361     /// clang implement it through x86 intrinsics, it is possible with IR alone
362     /// but leads to less optimal code.
363     /// PERF: try to implement it with __builtin_ia32_cmpss and immediate 0 to 7. 
364     /// Not that simple.
365     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
366     {
367         /*
368         enum ubyte predicateNumber = FPComparisonToX86Predicate[comparison];
369         enum bool invertOp = (predicateNumber & 0x80) != 0;
370         static if(invertOp)
371             return __builtin_ia32_cmpsd(b, a, predicateNumber & 0x7f);
372         else
373             return __builtin_ia32_cmpsd(a, b, predicateNumber & 0x7f);
374         */
375         enum ir = `
376             %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
377             %r = sext i1 %cmp to i32
378             %r2 = bitcast i32 %r to float
379             ret float %r2`;
380 
381         float4 r = a;
382         r[0] = LDCInlineIR!(ir, float, float, float)(a[0], b[0]);
383         return r;
384     }
385 
386     /// CMPSD-style comparisons
387     /// clang implement it through x86 intrinsics, it is possible with IR alone
388     /// but leads to less optimal code.
389     /// PERF: try to implement it with __builtin_ia32_cmpsd and immediate 0 to 7. 
390     /// Not that simple.    
391     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
392     {
393         enum ir = `
394             %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
395             %r = sext i1 %cmp to i64
396             %r2 = bitcast i64 %r to double
397             ret double %r2`;
398 
399         double2 r = a;
400         r[0] = LDCInlineIR!(ir, double, double, double)(a[0], b[0]);
401         return r;
402     }
403 
404     // Note: ucomss and ucomsd are left unimplemented
405     package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
406     {
407         enum ir = `
408             %cmp = fcmp `~ FPComparisonToString[comparison] ~` float %0, %1
409             %r = zext i1 %cmp to i32
410             ret i32 %r`;
411 
412         return LDCInlineIR!(ir, int, float, float)(a[0], b[0]);
413     }
414 
415     // Note: ucomss and ucomsd are left unimplemented
416     package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
417     {
418         enum ir = `
419             %cmp = fcmp `~ FPComparisonToString[comparison] ~` double %0, %1
420             %r = zext i1 %cmp to i32
421             ret i32 %r`;
422 
423         return LDCInlineIR!(ir, int, double, double)(a[0], b[0]);
424     }
425 }
426 else
427 {
428     /// Provides packed float comparisons
429     package int4 cmpps(FPComparison comparison)(float4 a, float4 b) pure @safe
430     {
431         int4 result;
432         foreach(i; 0..4)
433         {
434             result[i] = compareFloat!float(comparison, a[i], b[i]) ? -1 : 0;
435         }
436         return result;
437     }
438 
439     /// Provides packed double comparisons
440     package long2 cmppd(FPComparison comparison)(double2 a, double2 b) pure @safe
441     {
442         long2 result;
443         foreach(i; 0..2)
444         {
445             result[i] = compareFloat!double(comparison, a[i], b[i]) ? -1 : 0;
446         }
447         return result;
448     }
449 
450     /// Provides CMPSS-style comparison
451     package float4 cmpss(FPComparison comparison)(float4 a, float4 b) pure @safe
452     {
453         int4 result = cast(int4)a;
454         result[0] = compareFloat!float(comparison, a[0], b[0]) ? -1 : 0;
455         return cast(float4)result;
456     }
457 
458     /// Provides CMPSD-style comparison
459     package double2 cmpsd(FPComparison comparison)(double2 a, double2 b) pure @safe
460     {
461         long2 result = cast(long2)a;
462         result[0] = compareFloat!double(comparison, a[0], b[0]) ? -1 : 0;
463         return cast(double2)result;
464     }
465 
466     package int comss(FPComparison comparison)(float4 a, float4 b) pure @safe
467     {
468         return compareFloat!float(comparison, a[0], b[0]) ? 1 : 0;
469     }
470 
471     // Note: ucomss and ucomsd are left unimplemented
472     package int comsd(FPComparison comparison)(double2 a, double2 b) pure @safe
473     {
474         return compareFloat!double(comparison, a[0], b[0]) ? 1 : 0;
475     }
476 }
unittest // cmpps
{
    // Runs cmpps for one predicate and checks the four result lanes.
    void expectMask(FPComparison comparison)(float4 lhs, float4 rhs, const int[4] want)
    {
        int4 got = cmpps!comparison(lhs, rhs);
        assert(got.array == want);
    }

    // Lanes cover: less than, equal, greater than, and a NaN operand.
    float4 A = [1, 3, 5, float.nan];
    float4 B = [2, 3, 4, 5];

    // Expected masks, one per predicate
    static immutable int[4] correct_oeq    = [ 0,-1, 0, 0];
    static immutable int[4] correct_ogt    = [ 0, 0,-1, 0];
    static immutable int[4] correct_oge    = [ 0,-1,-1, 0];
    static immutable int[4] correct_olt    = [-1, 0, 0, 0];
    static immutable int[4] correct_ole    = [-1,-1, 0, 0];
    static immutable int[4] correct_one    = [-1, 0,-1, 0];
    static immutable int[4] correct_ord    = [-1,-1,-1, 0];
    static immutable int[4] correct_ueq    = [ 0,-1, 0,-1];
    static immutable int[4] correct_ugt    = [ 0, 0,-1,-1];
    static immutable int[4] correct_uge    = [ 0,-1,-1,-1];
    static immutable int[4] correct_ult    = [-1, 0, 0,-1];
    static immutable int[4] correct_ule    = [-1,-1, 0,-1];
    static immutable int[4] correct_une    = [-1, 0,-1,-1];
    static immutable int[4] correct_uno    = [ 0, 0, 0,-1];

    // Check every comparison type
    expectMask!(FPComparison.oeq)(A, B, correct_oeq);
    expectMask!(FPComparison.ogt)(A, B, correct_ogt);
    expectMask!(FPComparison.oge)(A, B, correct_oge);
    expectMask!(FPComparison.olt)(A, B, correct_olt);
    expectMask!(FPComparison.ole)(A, B, correct_ole);
    expectMask!(FPComparison.one)(A, B, correct_one);
    expectMask!(FPComparison.ord)(A, B, correct_ord);
    expectMask!(FPComparison.ueq)(A, B, correct_ueq);
    expectMask!(FPComparison.ugt)(A, B, correct_ugt);
    expectMask!(FPComparison.uge)(A, B, correct_uge);
    expectMask!(FPComparison.ult)(A, B, correct_ult);
    expectMask!(FPComparison.ule)(A, B, correct_ule);
    expectMask!(FPComparison.une)(A, B, correct_une);
    expectMask!(FPComparison.uno)(A, B, correct_uno);
}
unittest // cmppd
{
    double2 lhs = [1, 3];
    double2 rhs = [2, 3];

    // ult: lane 0 (1 < 2) holds, lane 1 (3 < 3) does not
    static immutable long[2] expected = [-1L, 0L];

    long2 mask = cmppd!(FPComparison.ult)(lhs, rhs);
    assert(mask.array == expected);
}
unittest // cmpss and comss
{
    // Checks cmpss and comss for one predicate against the scalar reference compareFloat.
    void checkOne(FPComparison comparison)(float4 lhs, float4 rhs)
    {
        float4 masked = cmpss!comparison(lhs, rhs);
        int4 bits = cast(int4)masked;
        immutable bool holds = compareFloat!float(comparison, lhs[0], rhs[0]);

        // lane 0 carries the mask...
        assert(bits[0] == (holds ? -1 : 0));
        // ...and the other lanes come from the first operand
        assert(masked[1] == lhs[1]);
        assert(masked[2] == lhs[2]);
        assert(masked[3] == lhs[3]);

        // check comss
        int flag = comss!comparison(lhs, rhs);
        assert(holds == (flag != 0));
    }

    // Runs one predicate against an ordered operand, then against one whose lane 0 is NaN.
    void checkBoth(FPComparison comparison)(float4 lhs, float4 ordered, float4 withNaN)
    {
        checkOne!comparison(lhs, ordered);
        checkOne!comparison(lhs, withNaN);
    }

    float4 A = [1, 3, 5, 6];
    float4 B = [2, 3, 4, 5];
    float4 C = [float.nan, 3, 4, 5];

    // Check every comparison type
    checkBoth!(FPComparison.oeq)(A, B, C);
    checkBoth!(FPComparison.ogt)(A, B, C);
    checkBoth!(FPComparison.oge)(A, B, C);
    checkBoth!(FPComparison.olt)(A, B, C);
    checkBoth!(FPComparison.ole)(A, B, C);
    checkBoth!(FPComparison.one)(A, B, C);
    checkBoth!(FPComparison.ord)(A, B, C);
    checkBoth!(FPComparison.ueq)(A, B, C);
    checkBoth!(FPComparison.ugt)(A, B, C);
    checkBoth!(FPComparison.uge)(A, B, C);
    checkBoth!(FPComparison.ult)(A, B, C);
    checkBoth!(FPComparison.ule)(A, B, C);
    checkBoth!(FPComparison.une)(A, B, C);
    checkBoth!(FPComparison.uno)(A, B, C);
}
unittest // cmpsd and comsd
{
    // Checks cmpsd and comsd for one predicate against the scalar reference compareFloat.
    void checkOne(FPComparison comparison)(double2 lhs, double2 rhs)
    {
        double2 masked = cmpsd!comparison(lhs, rhs);
        long2 bits = cast(long2)masked;
        immutable bool holds = compareFloat!double(comparison, lhs[0], rhs[0]);

        // lane 0 carries the mask, lane 1 comes from the first operand
        assert(bits[0] == (holds ? -1L : 0L));
        assert(masked[1] == lhs[1]);

        // check comsd
        int flag = comsd!comparison(lhs, rhs);
        assert(holds == (flag != 0));
    }

    // Runs one predicate against an ordered operand, then against one whose lane 0 is NaN.
    void checkBoth(FPComparison comparison)(double2 lhs, double2 ordered, double2 withNaN)
    {
        checkOne!comparison(lhs, ordered);
        checkOne!comparison(lhs, withNaN);
    }

    double2 A = [1, 3];
    double2 B = [2, 4];
    double2 C = [double.nan, 5];

    // Check every comparison type
    checkBoth!(FPComparison.oeq)(A, B, C);
    checkBoth!(FPComparison.ogt)(A, B, C);
    checkBoth!(FPComparison.oge)(A, B, C);
    checkBoth!(FPComparison.olt)(A, B, C);
    checkBoth!(FPComparison.ole)(A, B, C);
    checkBoth!(FPComparison.one)(A, B, C);
    checkBoth!(FPComparison.ord)(A, B, C);
    checkBoth!(FPComparison.ueq)(A, B, C);
    checkBoth!(FPComparison.ugt)(A, B, C);
    checkBoth!(FPComparison.uge)(A, B, C);
    checkBoth!(FPComparison.ult)(A, B, C);
    checkBoth!(FPComparison.ule)(A, B, C);
    checkBoth!(FPComparison.une)(A, B, C);
    checkBoth!(FPComparison.uno)(A, B, C);
}
636 
637 //
638 //  </FLOATING-POINT COMPARISONS>
639 //