1 /**
2 * `core.simd` emulation layer.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.types;
8 
9 
10 pure:
11 nothrow:
12 @nogc:
13 
14 version(GNU)
15 {
16     // Note: for GDC support, be sure to use https://explore.dgnu.org/
17 
18     version(X86_64)
19     {
20         enum MMXSizedVectorsAreEmulated = false;
21         enum SSESizedVectorsAreEmulated = false;
22 
23         // TODO: use D_AVX and D_AVX2 eventually to detect AVX?
24         enum AVXSizedVectorsAreEmulated = true;
25 
26         import gcc.builtins;
27 
        /// Loads a `float4` from a possibly unaligned address, using the
        /// GCC unaligned-load builtin.
        float4 loadUnaligned(Vec)(const(float)* pvec) @trusted if (is(Vec == float4))
        {
            return __builtin_ia32_loadups(pvec);
        }

        /// Loads a `double2` from a possibly unaligned address, using the
        /// GCC unaligned-load builtin.
        double2 loadUnaligned(Vec)(const(double)* pvec) @trusted if (is(Vec == double2))
        {
            return __builtin_ia32_loadupd(pvec);
        }

        /// Loads a `byte16` from a possibly unaligned address. The `dqu`
        /// builtin traffics in `char*`, hence the pointer cast.
        byte16 loadUnaligned(Vec)(const(byte)* pvec) @trusted if (is(Vec == byte16))
        {
            return cast(byte16) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        /// Loads a `short8` from a possibly unaligned address.
        short8 loadUnaligned(Vec)(const(short)* pvec) @trusted if (is(Vec == short8))
        {
            return cast(short8) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        /// Loads an `int4` from a possibly unaligned address.
        int4 loadUnaligned(Vec)(const(int)* pvec) @trusted if (is(Vec == int4))
        {
            return cast(int4) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        /// Loads a `long2` from a possibly unaligned address.
        long2 loadUnaligned(Vec)(const(long)* pvec) @trusted if (is(Vec == long2))
        {
            return cast(long2) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }
57 
        /// Stores a `float4` to a possibly unaligned address, using the
        /// GCC unaligned-store builtin.
        void storeUnaligned(Vec)(Vec v, float* pvec) @trusted if (is(Vec == float4))
        {
            __builtin_ia32_storeups(pvec, v);
        }

        /// Stores a `double2` to a possibly unaligned address, using the
        /// GCC unaligned-store builtin.
        void storeUnaligned(Vec)(Vec v, double* pvec) @trusted if (is(Vec == double2))
        {
            __builtin_ia32_storeupd(pvec, v);
        }

        /// Stores a `byte16` to a possibly unaligned address. The `dqu`
        /// builtin takes `char*` and an unsigned vector, hence the casts.
        void storeUnaligned(Vec)(Vec v, byte* pvec) @trusted if (is(Vec == byte16))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        /// Stores a `short8` to a possibly unaligned address.
        void storeUnaligned(Vec)(Vec v, short* pvec) @trusted if (is(Vec == short8))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        /// Stores an `int4` to a possibly unaligned address.
        void storeUnaligned(Vec)(Vec v, int* pvec) @trusted if (is(Vec == int4))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        /// Stores a `long2` to a possibly unaligned address.
        void storeUnaligned(Vec)(Vec v, long* pvec) @trusted if (is(Vec == long2))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }
87 
88         // TODO: for performance, replace that anywhere possible by a GDC intrinsic
89         Vec shufflevector(Vec, mask...)(Vec a, Vec b) @trusted
90         {
91             enum Count = Vec.array.length;
92             static assert(mask.length == Count);
93 
94             Vec r = void;
95             foreach(int i, m; mask)
96             {
97                 static assert (m < Count * 2);
98                 int ind = cast(int)m;
99                 if (ind < Count)
100                     r.ptr[i] = a.array[ind];
101                 else
102                     r.ptr[i] = b.array[ind - Count];
103             }
104             return r;
105         }
106     }
107     else
108     {
109         enum MMXSizedVectorsAreEmulated = true;
110         enum SSESizedVectorsAreEmulated = true;
111         enum AVXSizedVectorsAreEmulated = true;
112     }
113 }
else version(LDC)
{
    public import ldc.simd;

    // Use this alias to mention it should only be used with LDC,
    // for example when emulated shufflevector would just be wasteful.
    alias shufflevectorLDC = shufflevector; 

    // LDC provides real __vector types for every size class used here.
    enum MMXSizedVectorsAreEmulated = false;
    enum SSESizedVectorsAreEmulated = false;
    enum AVXSizedVectorsAreEmulated = false;
}
else version(DigitalMars)
{
    public import core.simd;

    // Note: turning this true is desirable,
    // and leads to many bugs being discovered upstream.
    // The fact that it works relies on many workarounds.
    // In particular intel-intrinsics with this on is a honeypot for DMD backend bugs.
    static if (__VERSION__ >= 2100)
    {
        enum bool tryToEnableCoreSimdWithDMD = true;
    }
    else
    {
        enum bool tryToEnableCoreSimdWithDMD = false;
    }

    version(D_SIMD)
    {
        // MMX-sized (64-bit) vectors are always emulated with DMD.
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;
        version(D_AVX)
            enum AVXSizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;
        else
            enum AVXSizedVectorsAreEmulated = true;
    }
    else
    {
        // Some DMD 32-bit targets don't have D_SIMD
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}
160 
// True when at least one vector size class is emulated with plain structs.
enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;

// GDC gets dedicated builtin-based load/store wrappers instead of the
// generic loadUnaligned/storeUnaligned templates defined further down.
version(GNU)
    enum bool DefineGenericLoadStoreUnaligned = false;
else
    enum bool DefineGenericLoadStoreUnaligned = CoreSimdIsEmulated;
167 
168 
169 static if (CoreSimdIsEmulated)
170 {
171     // core.simd is emulated in some capacity: introduce `VectorOps`
172 
    /// Mixin giving an emulated vector struct (a wrapper around a fixed-size
    /// `array` member) the operations expected of a `core.simd` vector:
    /// lane access, unary/binary operators, assignment, broadcast
    /// construction, and same-size reinterpreting casts.
    mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
    {
        enum Count = N;        // number of lanes
        alias Base = BaseType; // lane element type

        /// Pointer to the first lane, mirroring `core.simd`'s `.ptr`.
        BaseType* ptr() return pure nothrow @nogc
        {
            return array.ptr;
        }

        // Unary operators, applied lane-wise.
        VectorType opUnary(string op)() pure nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = " ~ op ~ "array[];");
            return res;
        }

        // Binary operators with another vector of the same type, lane-wise.
        VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = array[] " ~ op ~ " other.array[];");
            return res;
        }

        // Assigning a BaseType value broadcasts it to every lane.
        void opAssign(BaseType e) pure nothrow @safe @nogc
        {
            array[] = e;
        }

        // Assigning a static array copies it lane by lane.
        void opAssign(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        // Compound assignment with another vector, lane-wise.
        void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
        {
            mixin("array[] "  ~ op ~ "= other.array[];");
        }

        // Construction from a static array of lanes.
        this(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        // Broadcast constructor
        this(BaseType x) pure nothrow @safe @nogc
        {
            array[] = x;
        }

        /// We can't support implicit conversion but do support explicit casting.
        /// "Vector types of the same size can be implicitly converted among each other."
        /// Casting to another vector type is always just a raw copy.
        VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
            if (VecDest.sizeof == VectorType.sizeof)
            {
                VecDest dest = void;
                // Reinterpret the raw bytes; no per-lane value conversion.
                dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
                return dest;
            }

        /// Read/write lane access, like indexing a `core.simd` vector.
        ref inout(BaseType) opIndex(size_t i) inout return pure nothrow @safe @nogc
        {
            return array[i];
        }

    }
246 
247     // they just weren't interesting enough, use v.array[i] instead.
248     deprecated auto extractelement(Vec, int index, Vec2)(Vec2 vec) @trusted
249     {
250         static assert(Vec.sizeof == Vec2.sizeof);
251         import core.stdc.string: memcpy;
252         Vec v = void;
253         memcpy(&v, &vec, Vec2.sizeof);
254         return v.array[index];
255     }
256 
257     // they just weren't interesting enough, use v.ptr[i] = x instead.
258     deprecated auto insertelement(Vec, int index, Vec2)(Vec2 vec, Vec.Base e) @trusted
259     {
260         static assert(Vec.sizeof == Vec2.sizeof);
261         import core.stdc.string: memcpy;
262         Vec v = void;
263         memcpy(&v, &vec, Vec2.sizeof);
264         v.array[index] = e;
265         return v;
266     }
267 }
268 else
269 {
270     public import core.simd;
271 
272     // GDC cannot convert implicitely __vector from signed to unsigned, but LDC can
273     // And LDC sometimes need those unsigned vector types for some intrinsics.
274     // For internal use only.
275     package alias ushort8 = Vector!(ushort[8]);
276     package alias ubyte8  = Vector!(ubyte[8]);
277     package alias ubyte16 = Vector!(ubyte[16]);
278 }
279 
280 static if (DefineGenericLoadStoreUnaligned)
281 {
    /// Loads a vector of type `Vec` from a possibly unaligned pointer to its
    /// element type. Generic fallback used when no compiler builtin handles it.
    template loadUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        Vec loadUnaligned(const(BaseType!Vec)* pvec) @trusted
        {
            // True when `Vec` is a genuine compiler __vector (which has
            // alignment constraints), false when it is an emulated struct.
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* enabling this need to move loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        return cast(Vec)__simd(XMM.LODUPD, *pvec);
                    else static if (is(Vec == float4))
                        return cast(Vec)__simd(XMM.LODUPS, *pvec);
                    else
                        return cast(Vec)__simd(XMM.LODDQU, *pvec);
                }
                else */
                {
                    // Lane-by-lane copy: never performs an aligned vector
                    // load on a possibly unaligned address.
                    enum size_t Count = Vec.array.length;
                    Vec result;
                    foreach(int i; 0..Count)
                    {
                        result.ptr[i] = pvec[i];
                    }
                    return result;
                }
            }
            else
            {
                // Since this vector is emulated, it doesn't have alignment constraints
                // and as such we can just cast it.
                return *cast(Vec*)(pvec);
            }
        }
    }
324 
    /// Stores the vector `v` to a possibly unaligned pointer to its element
    /// type. Generic fallback used when no compiler builtin handles it.
    template storeUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        void storeUnaligned(Vec v, BaseType!Vec* pvec) @trusted
        {
            // True when `Vec` is a genuine compiler __vector (which has
            // alignment constraints), false when it is an emulated struct.
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* enabling this need to move loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        __simd_sto(XMM.STOUPD, *pvec, value);
                    else static if (is(Vec == float4))
                        __simd_sto(XMM.STOUPS, *pvec, value);
                    else
                        __simd_sto(XMM.STODQU, *pvec, value);
                }
                else*/
                {
                    // Lane-by-lane copy: never performs an aligned vector
                    // store to a possibly unaligned address.
                    enum size_t Count = Vec.array.length;
                    foreach(int i; 0..Count)
                        pvec[i] = v.array[i];
                }
            }
            else
            {
                // Emulated vectors have no alignment constraints: plain copy.
                *cast(Vec*)(pvec) = v;
            }
        }
    }
361 
362     Vec shufflevector(Vec, mask...)(Vec a, Vec b) @safe if (Vec.sizeof < 32)
363     {
364         enum size_t Count = Vec.array.length;
365         static assert(mask.length == Count);
366 
367         Vec r = void;
368         foreach(int i, m; mask)
369         {
370             static assert (m < Count * 2);
371             enum int ind = cast(int)m;
372             static if (ind < Count)
373                 r.array[i] = a.array[ind];
374             else
375                 r.array[i] = b.array[ind-Count];
376         }
377         return r;
378     }
379 }
380 
381 // Emulate ldc.simd cmpMask and other masks.
382 // Note: these should be deprecated on non-LDC, 
383 // since it's slower to generate that code.
384 version(LDC)
385 {} 
386 else
387 {
    /// Element type of vector-like type `V` (e.g. `float` for `float4`),
    /// deduced by sampling `.array[0]` on an instance built inside a lambda.
    private template BaseType(V)
    {
        alias typeof( ( { V v; return v; }()).array[0]) BaseType;
    }
392 
    /// The "all lanes true" value for the element type of `V`: an all-ones
    /// bit pattern, reinterpreted for floating-point element types.
    private template TrueMask(V)
    {
        alias Elem = BaseType!V;

        static if (is(Elem == float))
        {
            immutable uint m1 = 0xffffffff;
            // 32 set bits reinterpreted as a float.
            enum Elem TrueMask = *cast(float*)(&m1);
        }
        else static if (is(Elem == double))
        {
            immutable ulong m1 = 0xffffffff_ffffffff;
            // 64 set bits reinterpreted as a double.
            enum Elem TrueMask = *cast(double*)(&m1);
        }
        else // integer case
        {
            enum Elem TrueMask = -1; // all bits set
        }
    }
412 
413     Vec equalMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "oeq" comparison
414     {
415         enum size_t Count = Vec.array.length;
416         Vec result;
417         foreach(int i; 0..Count)
418         {
419             bool cond = a.array[i] == b.array[i];
420             result.ptr[i] = cond ? TrueMask!Vec : 0;
421         }
422         return result;
423     }
424 
425     Vec notEqualMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "one" comparison
426     {
427         enum size_t Count = Vec.array.length;
428         Vec result;
429         foreach(int i; 0..Count)
430         {
431             bool cond = a.array[i] != b.array[i];
432             result.ptr[i] = cond ? TrueMask!Vec : 0;
433         }
434         return result;
435     }
436 
437     Vec greaterMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "ogt" comparison
438     {
439         enum size_t Count = Vec.array.length;
440         Vec result;
441         foreach(int i; 0..Count)
442         {
443             bool cond = a.array[i] > b.array[i];
444             result.ptr[i] = cond ? TrueMask!Vec : 0;
445         }
446         return result;
447     }
448 }
449 
unittest
{
    // greaterMask should set all bits only in lanes where lhs > rhs.
    float4 lhs = [1, 3, 5, 7];
    float4 rhs = [2, 3, 4, 5];
    int4 bits = cast(int4)(greaterMask!float4(lhs, rhs));
    static immutable int[4] expected = [0, 0, 0xffff_ffff, 0xffff_ffff];
    assert(bits.array == expected);
}
458 
static if (MMXSizedVectorsAreEmulated)
{
    /// MMX-like SIMD types, emulated with plain structs + VectorOps.
    struct float2
    {
        float[2] array;
        mixin VectorOps!(float2, float[2]);
    }

    struct byte8
    {
        byte[8] array;
        mixin VectorOps!(byte8, byte[8]);
    }

    struct short4
    {
        short[4] array;
        mixin VectorOps!(short4, short[4]);
    }

    struct int2
    {
        int[2] array;
        mixin VectorOps!(int2, int[2]);
    }

    struct long1
    {
        long[1] array;
        mixin VectorOps!(long1, long[1]);
    }
}
else
{
    // For this compiler, defining MMX-sized vectors is working.
    public import core.simd;
    alias Vector!(long [1]) long1;
    alias Vector!(float[2]) float2;
    alias Vector!(int  [2]) int2;
    alias Vector!(short[4]) short4;
    alias Vector!(byte [8]) byte8;
}

// Emulated or not, every MMX-sized type must be exactly 8 bytes.
static assert(float2.sizeof == 8);
static assert(byte8.sizeof == 8);
static assert(short4.sizeof == 8);
static assert(int2.sizeof == 8);
static assert(long1.sizeof == 8);
508 
509 
static if (SSESizedVectorsAreEmulated)
{
    /// SSE-like SIMD types, emulated with plain structs + VectorOps.
    /// (When not emulated, these names come from `core.simd`, publicly
    /// imported in the compiler-specific sections above.)

    struct float4
    {
        float[4] array;
        mixin VectorOps!(float4, float[4]);
    }

    struct byte16
    {
        byte[16] array;
        mixin VectorOps!(byte16, byte[16]);
    }

    struct short8
    {
        short[8] array;
        mixin VectorOps!(short8, short[8]);
    }

    struct int4
    {
        int[4] array;
        mixin VectorOps!(int4, int[4]);
    }

    struct long2
    {
        long[2] array;
        mixin VectorOps!(long2, long[2]);
    }

    struct double2
    {
        double[2] array;
        mixin VectorOps!(double2, double[2]);
    }
}

// Emulated or not, every SSE-sized type must be exactly 16 bytes.
static assert(float4.sizeof == 16);
static assert(byte16.sizeof == 16);
static assert(short8.sizeof == 16);
static assert(int4.sizeof == 16);
static assert(long2.sizeof == 16);
static assert(double2.sizeof == 16);
557 
558 
static if (AVXSizedVectorsAreEmulated)
{
    /// AVX-like SIMD types, emulated with plain structs + VectorOps.

    struct float8
    {
        float[8] array;
        mixin VectorOps!(float8, float[8]);
    }

    struct byte32
    {
        byte[32] array;
        mixin VectorOps!(byte32, byte[32]);
    }

    struct short16
    {
        short[16] array;
        mixin VectorOps!(short16, short[16]);
    }

    struct int8
    {
        int[8] array;
        mixin VectorOps!(int8, int[8]);
    }

    struct long4
    {
        long[4] array;
        mixin VectorOps!(long4, long[4]);
    }

    struct double4
    {
        double[4] array;
        mixin VectorOps!(double4, double[4]);
    }
}
else
{
    public import core.simd;    
}
// Emulated or not, every AVX-sized type must be exactly 32 bytes.
static assert(float8.sizeof == 32);
static assert(byte32.sizeof == 32);
static assert(short16.sizeof == 32);
static assert(int8.sizeof == 32);
static assert(long4.sizeof == 32);
static assert(double4.sizeof == 32);
609 
610 
611 
612 
/// Intel-style type names, matching the C intrinsics headers.
alias __m256 = float8;
alias __m256i = long4; // long long __vector with ICC, GCC, and clang
alias __m256d = double4;
alias __m128 = float4;
alias __m128i = int4;
alias __m128d = double2;
alias __m64 = long1; // like in Clang, __m64 is a vector of 1 long
620 
/// Builds a 2-bit shuffle immediate: `y` occupies bit 0, `x` bit 1.
/// Both selectors must be in [0, 1].
int _MM_SHUFFLE2(int x, int y) pure @safe
{
    assert(x >= 0 && x <= 1);
    assert(y >= 0 && y <= 1);
    return y | (x << 1);
}
627 
/// Builds an 8-bit shuffle immediate from four 2-bit lane selectors:
/// `w` occupies bits 0-1, `x` bits 2-3, `y` bits 4-5, `z` bits 6-7.
/// Each selector must be in [0, 3].
int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
{
    assert(x >= 0 && x <= 3);
    assert(y >= 0 && y <= 3);
    assert(z >= 0 && z <= 3);
    assert(w >= 0 && w <= 3);
    int imm8 = w;
    imm8 |= x << 2;
    imm8 |= y << 4;
    imm8 |= z << 6;
    return imm8;
}
636 
637 // test assignment from scalar to vector type
unittest
{
    // Assigning a scalar to a vector must broadcast it to every lane.
    float4 vecF = 3.0f;
    float[4] expectedF = [3.0f, 3.0f, 3.0f, 3.0f];
    assert(vecF.array == expectedF);

    int2 vecI = 42;
    int[2] expectedI = [42, 42];
    assert(vecI.array == expectedI);
}