1 /**
2 * `core.simd` emulation layer.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.types;
8 
9 
10 pure:
11 nothrow:
12 @nogc:
13 
version(GNU)
{
    // Note: for GDC support, be sure to use https://explore.dgnu.org/

    version(X86_64)
    {
        enum MMXSizedVectorsAreEmulated = false;
        enum SSESizedVectorsAreEmulated = false;

        // TODO: use D_AVX and D_AVX2 eventually to detect AVX?
        enum AVXSizedVectorsAreEmulated = true;

        import gcc.builtins;

        // Unaligned loads, forwarded to the matching GCC x86 builtins
        // (movups / movupd / movdqu); integer types all go through loaddqu.

        float4 loadUnaligned(Vec)(const(float)* pvec) @trusted if (is(Vec == float4))
        {
            return __builtin_ia32_loadups(pvec);
        }

        double2 loadUnaligned(Vec)(const(double)* pvec) @trusted if (is(Vec == double2))
        {
            return __builtin_ia32_loadupd(pvec);
        }

        byte16 loadUnaligned(Vec)(const(byte)* pvec) @trusted if (is(Vec == byte16))
        {
            return cast(byte16) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        short8 loadUnaligned(Vec)(const(short)* pvec) @trusted if (is(Vec == short8))
        {
            return cast(short8) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        int4 loadUnaligned(Vec)(const(int)* pvec) @trusted if (is(Vec == int4))
        {
            return cast(int4) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        long2 loadUnaligned(Vec)(const(long)* pvec) @trusted if (is(Vec == long2))
        {
            return cast(long2) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        // Unaligned stores, forwarded to the matching GCC x86 builtins
        // (storeups / storeupd / storedqu).

        void storeUnaligned(Vec)(Vec v, float* pvec) @trusted if (is(Vec == float4))
        {
            __builtin_ia32_storeups(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, double* pvec) @trusted if (is(Vec == double2))
        {
            __builtin_ia32_storeupd(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, byte* pvec) @trusted if (is(Vec == byte16))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, short* pvec) @trusted if (is(Vec == short8))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, int* pvec) @trusted if (is(Vec == int4))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, long* pvec) @trusted if (is(Vec == long2))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        // TODO: for performance, replace that anywhere possible by a GDC intrinsic
        /// Emulated LLVM-style shufflevector: each compile-time mask element
        /// selects a lane from `a` (0 .. Count-1) or `b` (Count .. 2*Count-1).
        Vec shufflevector(Vec, mask...)(Vec a, Vec b) @trusted
        {
            enum Count = Vec.array.length;
            static assert(mask.length == Count);

            Vec r = void;
            foreach(int i, m; mask)
            {
                static assert (m < Count * 2);
                // The mask elements are template arguments, hence known at
                // compile time: select the source lane with `static if`
                // (like the generic fallback) instead of a runtime branch.
                enum int ind = cast(int)m;
                static if (ind < Count)
                    r.ptr[i] = a.array[ind];
                else
                    r.ptr[i] = b.array[ind - Count];
            }
            return r;
        }
    }
    else
    {
        // 32-bit or non-x86 GDC targets: emulate every vector size.
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}
else version(LDC)
{
    // LDC ships real LLVM-backed SIMD support: reuse ldc.simd wholesale
    // (loadUnaligned, storeUnaligned, shufflevector, cmpMask, ...).
    public import ldc.simd;

    // Use this alias to mention it should only be used with LDC,
    // for example when emulated shufflevector would just be wasteful.
    alias shufflevectorLDC = shufflevector; 

    // LDC supports native __vector types at every size this module needs.
    enum MMXSizedVectorsAreEmulated = false;
    enum SSESizedVectorsAreEmulated = false;
    enum AVXSizedVectorsAreEmulated = false;
}
else version(DigitalMars)
{
    public import core.simd;

    // Note: turning this true is desirable,
    // and leads to many bugs being discovered upstream.
    // The fact that it works relies on many workarounds.
    // In particular intel-intrinsics with this on is a honeypot for DMD backend bugs.
    // What happens next is that contributors end up on a DMD bug in their PR.
    //
    // Failed attempts: xxx
    //
    // Note: this used to be gated on `static if (__VERSION__ >= 2100)`,
    // but both branches were `false`, so the redundant check was removed.
    enum bool tryToEnableCoreSimdWithDMD = false;

    version(D_SIMD)
    {
        // MMX-sized vectors are never native with DMD.
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;
        version(D_AVX)
            enum AVXSizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;
        else
            enum AVXSizedVectorsAreEmulated = true;
    }
    else
    {
        // Some DMD 32-bit targets don't have D_SIMD
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}
164 
/// True if at least one vector size (MMX/SSE/AVX) needs software emulation
/// on this compiler/target; governs whether `VectorOps` is defined below.
enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;

// GDC already defines its own loadUnaligned/storeUnaligned (builtin-backed),
// so only non-GNU compilers get the generic fallbacks.
version(GNU)
    enum bool DefineGenericLoadStoreUnaligned = false;
else
    enum bool DefineGenericLoadStoreUnaligned = CoreSimdIsEmulated;
171 
172 
static if (CoreSimdIsEmulated)
{
    // core.simd is emulated in some capacity: introduce `VectorOps`

    /// Mixin giving an emulated vector struct (a wrapper around a static
    /// array named `array`) the same surface API as a built-in `__vector`:
    /// element-wise operators, broadcast/array construction and assignment,
    /// raw-copy casts between same-sized vectors, and lane access.
    mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
    {
        enum Count = N;         // number of lanes
        alias Base = BaseType;  // lane type

        // Pointer to the first lane; mirrors __vector's .ptr property.
        BaseType* ptr() return pure nothrow @nogc
        {
            return array.ptr;
        }

        // Unary operators, applied lane-wise via array ops.
        VectorType opUnary(string op)() pure nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = " ~ op ~ "array[];");
            return res;
        }

        // Binary operators, applied lane-wise via array ops.
        VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = array[] " ~ op ~ " other.array[];");
            return res;
        }

        // Assigning a BaseType value broadcasts it to every lane.
        void opAssign(BaseType e) pure nothrow @safe @nogc
        {
            array[] = e;
        }

        // Assigning a static array copies it lane by lane.
        void opAssign(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        // In-place lane-wise operators (+=, -=, ...).
        void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
        {
            mixin("array[] "  ~ op ~ "= other.array[];");
        }

        // Construction from a static array.
        this(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        // Broadcast constructor: every lane takes the value x.
        this(BaseType x) pure nothrow @safe @nogc
        {
            array[] = x;
        }

        /// We can't support implicit conversion but do support explicit casting.
        /// "Vector types of the same size can be implicitly converted among each other."
        /// Casting to another vector type is always just a raw copy.
        VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
            if (VecDest.sizeof == VectorType.sizeof)
            {
                VecDest dest = void;
                // Copy the raw bytes, reinterpreted through void[] so the
                // lane types need not match.
                dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
                return dest;
            }

        // Lane access by index; returns a reference so v[i] is assignable.
        ref inout(BaseType) opIndex(size_t i) inout return pure nothrow @safe @nogc
        {
            return array[i];
        }

    }

    // they just weren't interesting enough, use v.array[i] instead.
    deprecated auto extractelement(Vec, int index, Vec2)(Vec2 vec) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        // Reinterpret vec as Vec (same size guaranteed above), then read a lane.
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        return v.array[index];
    }

    // they just weren't interesting enough, use v.ptr[i] = x instead.
    deprecated auto insertelement(Vec, int index, Vec2)(Vec2 vec, Vec.Base e) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        // Reinterpret vec as Vec, overwrite one lane, return the copy.
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        v.array[index] = e;
        return v;
    }
}
else
{
    public import core.simd;

    // GDC cannot convert implicitely __vector from signed to unsigned, but LDC can
    // And LDC sometimes need those unsigned vector types for some intrinsics.
    // For internal use only.
    package alias ushort8 = Vector!(ushort[8]);
    package alias ubyte8  = Vector!(ubyte[8]);
    package alias ubyte16 = Vector!(ubyte[16]);
}
283 
static if (DefineGenericLoadStoreUnaligned)
{
    /// Generic unaligned load fallback, used when neither GDC builtins
    /// nor ldc.simd provide one.
    template loadUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        Vec loadUnaligned(const(BaseType!Vec)* pvec) @trusted
        {
            // True when Vec is a real __vector for this compiler/target,
            // i.e. it carries alignment constraints we must not violate.
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* enabling this need to move loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        return cast(Vec)__simd(XMM.LODUPD, *pvec);
                    else static if (is(Vec == float4))
                        return cast(Vec)__simd(XMM.LODUPS, *pvec);
                    else
                        return cast(Vec)__simd(XMM.LODDQU, *pvec);
                }
                else */
                {
                    // Copy lane by lane: correct for any alignment of pvec.
                    enum size_t Count = Vec.array.length;
                    Vec result;
                    foreach(int i; 0..Count)
                    {
                        result.ptr[i] = pvec[i];
                    }
                    return result;
                }
            }
            else
            {
                // Since this vector is emulated, it doesn't have alignement constraints
                // and as such we can just cast it.
                return *cast(Vec*)(pvec);
            }
        }
    }

    /// Generic unaligned store fallback (counterpart of loadUnaligned above).
    template storeUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        void storeUnaligned(Vec v, BaseType!Vec* pvec) @trusted
        {
            // Same test as in loadUnaligned: true for real __vector types.
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* enabling this need to move loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        __simd_sto(XMM.STOUPD, *pvec, value);
                    else static if (is(Vec == float4))
                        __simd_sto(XMM.STOUPS, *pvec, value);
                    else
                        __simd_sto(XMM.STODQU, *pvec, value);
                }
                else*/
                {
                    // Copy lane by lane: correct for any alignment of pvec.
                    enum size_t Count = Vec.array.length;
                    foreach(int i; 0..Count)
                        pvec[i] = v.array[i];
                }
            }
            else
            {
                // Emulated vector: no alignment constraint, plain assignment.
                *cast(Vec*)(pvec) = v;
            }
        }
    }

    /// Emulated LLVM-style shufflevector: each compile-time mask element
    /// selects a lane from `a` (0 .. Count-1) or `b` (Count .. 2*Count-1).
    Vec shufflevector(Vec, mask...)(Vec a, Vec b) @safe if (Vec.sizeof < 32)
    {
        enum size_t Count = Vec.array.length;
        static assert(mask.length == Count);

        Vec r = void;
        foreach(int i, m; mask)
        {
            static assert (m < Count * 2);
            enum int ind = cast(int)m;
            static if (ind < Count)
                r.array[i] = a.array[ind];
            else
                r.array[i] = b.array[ind-Count];
        }
        return r;
    }
}
384 
// Emulate ldc.simd cmpMask and other masks.
// Note: these should be deprecated on non-LDC,
// since it's slower to generate that code.
version(LDC)
{}
else
{
    // Lane type of a vector-like type V. Works for both __vector types and
    // the emulated structs, since both expose a static array named `array`.
    private template BaseType(V)
    {
        alias BaseType = typeof( ( { V v; return v; }()).array[0]);
    }

    // Per-lane "all bits set" constant for V's element type.
    private template TrueMask(V)
    {
        alias Elem = BaseType!V;

        static if (is(Elem == float))
        {
            // Reinterpret a 32-bit all-ones pattern as a float.
            immutable uint allOnes32 = 0xffffffff;
            enum Elem TrueMask = *cast(float*)(&allOnes32);
        }
        else static if (is(Elem == double))
        {
            // Reinterpret a 64-bit all-ones pattern as a double.
            immutable ulong allOnes64 = 0xffffffff_ffffffff;
            enum Elem TrueMask = *cast(double*)(&allOnes64);
        }
        else // integer case
        {
            enum Elem TrueMask = -1;
        }
    }

    Vec equalMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "oeq" comparison
    {
        enum size_t laneCount = Vec.array.length;
        Vec mask;
        foreach(int lane; 0..laneCount)
            mask.ptr[lane] = (a.array[lane] == b.array[lane]) ? TrueMask!Vec : 0;
        return mask;
    }

    Vec notEqualMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "one" comparison
    {
        enum size_t laneCount = Vec.array.length;
        Vec mask;
        foreach(int lane; 0..laneCount)
            mask.ptr[lane] = (a.array[lane] != b.array[lane]) ? TrueMask!Vec : 0;
        return mask;
    }

    Vec greaterMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "ogt" comparison
    {
        enum size_t laneCount = Vec.array.length;
        Vec mask;
        foreach(int lane; 0..laneCount)
            mask.ptr[lane] = (a.array[lane] > b.array[lane]) ? TrueMask!Vec : 0;
        return mask;
    }
}
453 
// greaterMask: lanes where a > b become all-ones (0xffff_ffff when viewed
// as int), others become zero.
unittest
{
    float4 a = [1, 3, 5, 7];
    float4 b = [2, 3, 4, 5];
    int4 c = cast(int4)(greaterMask!float4(a, b));
    static immutable int[4] correct = [0, 0, 0xffff_ffff, 0xffff_ffff];
    assert(c.array == correct);
}
462 
static if (MMXSizedVectorsAreEmulated)
{
    /// MMX-like SIMD types, emulated as plain structs wrapping a static array.
    struct float2
    {
        float[2] array;
        mixin VectorOps!(float2, float[2]);
    }

    struct byte8
    {
        byte[8] array;
        mixin VectorOps!(byte8, byte[8]);
    }

    struct short4
    {
        short[4] array;
        mixin VectorOps!(short4, short[4]);
    }

    struct int2
    {
        int[2] array;
        mixin VectorOps!(int2, int[2]);
    }

    struct long1
    {
        long[1] array;
        mixin VectorOps!(long1, long[1]);
    }
}
else
{
    // For this compiler, defining MMX-sized vectors is working.
    public import core.simd;
    alias Vector!(long [1]) long1;
    alias Vector!(float[2]) float2;
    alias Vector!(int  [2]) int2;
    alias Vector!(short[4]) short4;
    alias Vector!(byte [8]) byte8;
}

// Whether emulated or native, every MMX-sized type must be exactly 8 bytes.
static assert(float2.sizeof == 8);
static assert(byte8.sizeof == 8);
static assert(short4.sizeof == 8);
static assert(int2.sizeof == 8);
static assert(long1.sizeof == 8);
512 
513 
static if (SSESizedVectorsAreEmulated)
{
    /// SSE-like SIMD types, emulated as plain structs wrapping a static array.
    /// (When not emulated, these names come from core.simd instead.)

    struct float4
    {
        float[4] array;
        mixin VectorOps!(float4, float[4]);
    }

    struct byte16
    {
        byte[16] array;
        mixin VectorOps!(byte16, byte[16]);
    }

    struct short8
    {
        short[8] array;
        mixin VectorOps!(short8, short[8]);
    }

    struct int4
    {
        int[4] array;
        mixin VectorOps!(int4, int[4]);
    }

    struct long2
    {
        long[2] array;
        mixin VectorOps!(long2, long[2]);
    }

    struct double2
    {
        double[2] array;
        mixin VectorOps!(double2, double[2]);
    }
}

// Whether emulated or native, every SSE-sized type must be exactly 16 bytes.
static assert(float4.sizeof == 16);
static assert(byte16.sizeof == 16);
static assert(short8.sizeof == 16);
static assert(int4.sizeof == 16);
static assert(long2.sizeof == 16);
static assert(double2.sizeof == 16);
561 
562 
static if (AVXSizedVectorsAreEmulated)
{
    /// AVX-like SIMD types, emulated as plain structs wrapping a static array.

    struct float8
    {
        float[8] array;
        mixin VectorOps!(float8, float[8]);
    }

    struct byte32
    {
        byte[32] array;
        mixin VectorOps!(byte32, byte[32]);
    }

    struct short16
    {
        short[16] array;
        mixin VectorOps!(short16, short[16]);
    }

    struct int8
    {
        int[8] array;
        mixin VectorOps!(int8, int[8]);
    }

    struct long4
    {
        long[4] array;
        mixin VectorOps!(long4, long[4]);
    }

    struct double4
    {
        double[4] array;
        mixin VectorOps!(double4, double[4]);
    }
}
else
{
    // Native AVX-sized vectors: take them from core.simd.
    public import core.simd;    
}
// Whether emulated or native, every AVX-sized type must be exactly 32 bytes.
static assert(float8.sizeof == 32);
static assert(byte32.sizeof == 32);
static assert(short16.sizeof == 32);
static assert(int8.sizeof == 32);
static assert(long4.sizeof == 32);
static assert(double4.sizeof == 32);
613 
614 
615 
616 
// Intel-style intrinsic type names, mapped onto the vector types above.
alias __m256 = float8;
alias __m256i = long4; // long long __vector with ICC, GCC, and clang
alias __m256d = double4;
alias __m128 = float4;
alias __m128i = int4;
alias __m128d = double2;
alias __m64 = long1; // like in Clang, __m64 is a vector of 1 long
624 
/// Builds a 2-bit shuffle immediate from two lane selectors.
/// Each selector must be 0 or 1; x occupies the high bit, y the low bit.
int _MM_SHUFFLE2(int x, int y) pure @safe
{
    assert(0 <= x && x <= 1);
    assert(0 <= y && y <= 1);
    return y | (x << 1);
}
631 
/// Builds an 8-bit shuffle immediate from four 2-bit lane selectors.
/// Each selector must be in 0..3; z lands in bits 7-6, y in 5-4,
/// x in 3-2 and w in 1-0 (same layout as Intel's _MM_SHUFFLE macro).
int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
{
    assert(0 <= x && x <= 3);
    assert(0 <= y && y <= 3);
    assert(0 <= z && z <= 3);
    assert(0 <= w && w <= 3);
    return w | (x << 2) | (y << 4) | (z << 6);
}
640 
// test assignment from scalar to vector type
// (a scalar assigned to a vector broadcasts to every lane)
unittest
{
    float4 A = 3.0f;
    float[4] correctA = [3.0f, 3.0f, 3.0f, 3.0f];
    assert(A.array == correctA);

    int2 B = 42;
    int[2] correctB = [42, 42];
    assert(B.array == correctB);
}