1 /**
2 * `core.simd` emulation layer.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.types;
8 
9 
10 pure:
11 nothrow:
12 @nogc:
13 
14 version(GNU)
15 {
16     // Note: for GDC support, be sure to use https://explore.dgnu.org/
17 
18     version(X86_64)
19     {
20         enum MMXSizedVectorsAreEmulated = false;
21         enum SSESizedVectorsAreEmulated = false;
22 
        // TODO: eventually use D_AVX and D_AVX2 to detect AVX?
24         enum AVXSizedVectorsAreEmulated = true;
25 
26         import gcc.builtins;
27 
28         float4 loadUnaligned(Vec)(const(float)* pvec) @trusted if (is(Vec == float4))
29         {
30             return __builtin_ia32_loadups(pvec);
31         }
32 
33         double2 loadUnaligned(Vec)(const(double)* pvec) @trusted if (is(Vec == double2))
34         {
35             return __builtin_ia32_loadupd(pvec);
36         }
37 
38         byte16 loadUnaligned(Vec)(const(byte)* pvec) @trusted if (is(Vec == byte16))
39         {
40             return cast(byte16) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
41         }
42 
43         short8 loadUnaligned(Vec)(const(short)* pvec) @trusted if (is(Vec == short8))
44         {
45             return cast(short8) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
46         }
47 
48         int4 loadUnaligned(Vec)(const(int)* pvec) @trusted if (is(Vec == int4))
49         {
50             return cast(int4) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
51         }
52 
53         long2 loadUnaligned(Vec)(const(long)* pvec) @trusted if (is(Vec == long2))
54         {
55             return cast(long2) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
56         }
57 
58         void storeUnaligned(Vec)(Vec v, float* pvec) @trusted if (is(Vec == float4))
59         {
60             __builtin_ia32_storeups(pvec, v);
61         }
62 
63         void storeUnaligned(Vec)(Vec v, double* pvec) @trusted if (is(Vec == double2))
64         {
65             __builtin_ia32_storeupd(pvec, v);
66         }
67 
68         void storeUnaligned(Vec)(Vec v, byte* pvec) @trusted if (is(Vec == byte16))
69         {
70             __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
71         }
72 
73         void storeUnaligned(Vec)(Vec v, short* pvec) @trusted if (is(Vec == short8))
74         {
75             __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
76         }
77 
78         void storeUnaligned(Vec)(Vec v, int* pvec) @trusted if (is(Vec == int4))
79         {
80             __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
81         }
82 
83         void storeUnaligned(Vec)(Vec v, long* pvec) @trusted if (is(Vec == long2))
84         {
85             __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
86         }
87 
        // TODO: for performance, replace this with a GDC intrinsic wherever possible.
89         Vec shufflevector(Vec, mask...)(Vec a, Vec b) @trusted
90         {
91             enum Count = Vec.array.length;
92             static assert(mask.length == Count);
93 
94             Vec r = void;
95             foreach(int i, m; mask)
96             {
97                 static assert (m < Count * 2);
98                 int ind = cast(int)m;
99                 if (ind < Count)
100                     r.ptr[i] = a.array[ind];
101                 else
102                     r.ptr[i] = b.array[ind - Count];
103             }
104             return r;
105         }
106     }
107     else
108     {
109         enum MMXSizedVectorsAreEmulated = true;
110         enum SSESizedVectorsAreEmulated = true;
111         enum AVXSizedVectorsAreEmulated = true;
112     }
113 }
114 else version(LDC)
115 {
116     public import ldc.simd;
117 
    // Use this alias to signal that a use site is LDC-only,
    // for example where an emulated shufflevector would just be wasteful.
    alias shufflevectorLDC = shufflevector;
121 
122     enum MMXSizedVectorsAreEmulated = false;
123     enum SSESizedVectorsAreEmulated = false;
124     enum AVXSizedVectorsAreEmulated = false;
125 }
126 else version(DigitalMars)
127 {
128     public import core.simd;
129 
    // Note: turning this to true is desirable,
    // and doing so has led to many bugs being discovered upstream.
    // Last attempt to enable this: DMD 2.100-b1.
    // When this turns true, make it depend on __VERSION__.
    // 30.04.2022: all tests pass, with DMD core.simd actually used. Promising.
135     enum bool tryToEnableCoreSimdWithDMD = false;
136 
137     version(D_SIMD)
138     {
139         enum MMXSizedVectorsAreEmulated = true;
140         enum SSESizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;
141         version(D_AVX)
142             enum AVXSizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;
143         else
144             enum AVXSizedVectorsAreEmulated = true;
145     }
146     else
147     {
148         // Some DMD 32-bit targets don't have D_SIMD
149         enum MMXSizedVectorsAreEmulated = true;
150         enum SSESizedVectorsAreEmulated = true;
151         enum AVXSizedVectorsAreEmulated = true;
152     }
153 }
154 
155 enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;
156 
157 version(GNU)
158     enum bool DefineGenericLoadStoreUnaligned = false;
159 else
160     enum bool DefineGenericLoadStoreUnaligned = CoreSimdIsEmulated;
161 
162 
163 static if (CoreSimdIsEmulated)
164 {
165     // core.simd is emulated in some capacity: introduce `VectorOps`
166 
167     mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
168     {
169         enum Count = N;
170         alias Base = BaseType;
171 
172         BaseType* ptr() return pure nothrow @nogc
173         {
174             return array.ptr;
175         }
176 
177         // Unary operators
178         VectorType opUnary(string op)() pure nothrow @safe @nogc
179         {
180             VectorType res = void;
181             mixin("res.array[] = " ~ op ~ "array[];");
182             return res;
183         }
184 
185         // Binary operators
186         VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
187         {
188             VectorType res = void;
189             mixin("res.array[] = array[] " ~ op ~ " other.array[];");
190             return res;
191         }
192 
193         // Assigning a BaseType value
194         void opAssign(BaseType e) pure nothrow @safe @nogc
195         {
196             array[] = e;
197         }
198 
199         // Assigning a static array
200         void opAssign(ArrayType v) pure nothrow @safe @nogc
201         {
202             array[] = v[];
203         }
204 
205         void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
206         {
207             mixin("array[] "  ~ op ~ "= other.array[];");
208         }
209 
        // Constructor from a static array
211         this(ArrayType v) pure nothrow @safe @nogc
212         {
213             array[] = v[];
214         }
215 
216         // Broadcast constructor
217         this(BaseType x) pure nothrow @safe @nogc
218         {
219             array[] = x;
220         }
221 
        /// We can't support implicit conversion, but we do support explicit casting.
        /// ("Vector types of the same size can be implicitly converted among each other.")
        /// Casting to another vector type is always just a raw copy.
225         VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
226             if (VecDest.sizeof == VectorType.sizeof)
227             {
228                 VecDest dest = void;
229                 // Copy
230                 dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
231                 return dest;
232             }
233 
234         ref inout(BaseType) opIndex(size_t i) inout return pure nothrow @safe @nogc
235         {
236             return array[i];
237         }
238 
239     }
240 
    // Deprecated: these just weren't interesting enough; use v.array[i] instead.
242     deprecated auto extractelement(Vec, int index, Vec2)(Vec2 vec) @trusted
243     {
244         static assert(Vec.sizeof == Vec2.sizeof);
245         import core.stdc.string: memcpy;
246         Vec v = void;
247         memcpy(&v, &vec, Vec2.sizeof);
248         return v.array[index];
249     }
250 
    // Deprecated: these just weren't interesting enough; use v.ptr[i] = x instead.
252     deprecated auto insertelement(Vec, int index, Vec2)(Vec2 vec, Vec.Base e) @trusted
253     {
254         static assert(Vec.sizeof == Vec2.sizeof);
255         import core.stdc.string: memcpy;
256         Vec v = void;
257         memcpy(&v, &vec, Vec2.sizeof);
258         v.array[index] = e;
259         return v;
260     }
261 }
262 else
263 {
264     public import core.simd;
265 
    // GDC cannot implicitly convert a __vector from signed to unsigned, but LDC can,
    // and LDC sometimes needs those unsigned vector types for some intrinsics.
268     // For internal use only.
269     package alias ushort8 = Vector!(ushort[8]);
270     package alias ubyte8  = Vector!(ubyte[8]);
271     package alias ubyte16 = Vector!(ubyte[16]);
272 }
273 
274 static if (DefineGenericLoadStoreUnaligned)
275 {
276     template loadUnaligned(Vec)
277     {
278         // Note: can't be @safe with this signature
279         Vec loadUnaligned(const(BaseType!Vec)* pvec) @trusted
280         {
281             enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
282                                 || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
283                                 || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );
284 
285             static if (isVector)
286             {
287                 // PERF DMD
288                 // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* Enabling this needs moving loadUnaligned and storeUnaligned to internals.d.
                static if (DMD_with_DSIMD && Vec.sizeof == 16)
291                 {
292                     static if (is(Vec == double2))
293                         return cast(Vec)__simd(XMM.LODUPD, *pvec);
294                     else static if (is(Vec == float4))
295                         return cast(Vec)__simd(XMM.LODUPS, *pvec);
296                     else
297                         return cast(Vec)__simd(XMM.LODDQU, *pvec);
298                 }
299                 else */
300                 {
301                     enum size_t Count = Vec.array.length;
302                     Vec result;
303                     foreach(int i; 0..Count)
304                     {
305                         result.ptr[i] = pvec[i];
306                     }
307                     return result;
308                 }
309             }
310             else
311             {
                // Since this vector is emulated, it doesn't have alignment constraints
313                 // and as such we can just cast it.
314                 return *cast(Vec*)(pvec);
315             }
316         }
317     }
318 
319     template storeUnaligned(Vec)
320     {
321         // Note: can't be @safe with this signature
322         void storeUnaligned(Vec v, BaseType!Vec* pvec) @trusted
323         {
324             enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
325                                 || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
326                                 || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );
327 
328             static if (isVector)
329             {
330                 // PERF DMD
331                 // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* Enabling this needs moving loadUnaligned and storeUnaligned to internals.d.
                static if (DMD_with_DSIMD && Vec.sizeof == 16)
                {
                    static if (is(Vec == double2))
                        __simd_sto(XMM.STOUPD, *pvec, v);
                    else static if (is(Vec == float4))
                        __simd_sto(XMM.STOUPS, *pvec, v);
                    else
                        __simd_sto(XMM.STODQU, *pvec, v);
341                 }
342                 else*/
343                 {
344                     enum size_t Count = Vec.array.length;
345                     foreach(int i; 0..Count)
346                         pvec[i] = v.array[i];
347                 }
348             }
349             else
350             {
351                 *cast(Vec*)(pvec) = v;
352             }
353         }
354     }
355 
356     Vec shufflevector(Vec, mask...)(Vec a, Vec b) @safe if (Vec.sizeof < 32)
357     {
358         enum size_t Count = Vec.array.length;
359         static assert(mask.length == Count);
360 
361         Vec r = void;
362         foreach(int i, m; mask)
363         {
364             static assert (m < Count * 2);
365             enum int ind = cast(int)m;
366             static if (ind < Count)
367                 r.array[i] = a.array[ind];
368             else
369                 r.array[i] = b.array[ind-Count];
370         }
371         return r;
372     }
373 }
374 
// Emulate ldc.simd cmpMask and other masks.
// Note: these should be deprecated on non-LDC,
// since the code they generate is slower.
378 version(LDC)
379 {} 
380 else
381 {
382     private template BaseType(V)
383     {
384         alias typeof( ( { V v; return v; }()).array[0]) BaseType;
385     }
386 
387     private template TrueMask(V)
388     {
389         alias Elem = BaseType!V;
390 
391         static if (is(Elem == float))
392         {
393             immutable uint m1 = 0xffffffff;
394             enum Elem TrueMask = *cast(float*)(&m1);
395         }
396         else static if (is(Elem == double))
397         {
398             immutable ulong m1 = 0xffffffff_ffffffff;
399             enum Elem TrueMask = *cast(double*)(&m1);
400         }
401         else // integer case
402         {
403             enum Elem TrueMask = -1;
404         }
405     }
406 
407     Vec equalMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "oeq" comparison
408     {
409         enum size_t Count = Vec.array.length;
410         Vec result;
411         foreach(int i; 0..Count)
412         {
413             bool cond = a.array[i] == b.array[i];
414             result.ptr[i] = cond ? TrueMask!Vec : 0;
415         }
416         return result;
417     }
418 
419     Vec notEqualMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "one" comparison
420     {
421         enum size_t Count = Vec.array.length;
422         Vec result;
423         foreach(int i; 0..Count)
424         {
425             bool cond = a.array[i] != b.array[i];
426             result.ptr[i] = cond ? TrueMask!Vec : 0;
427         }
428         return result;
429     }
430 
431     Vec greaterMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "ogt" comparison
432     {
433         enum size_t Count = Vec.array.length;
434         Vec result;
435         foreach(int i; 0..Count)
436         {
437             bool cond = a.array[i] > b.array[i];
438             result.ptr[i] = cond ? TrueMask!Vec : 0;
439         }
440         return result;
441     }
442 }
443 
444 unittest
445 {
446     float4 a = [1, 3, 5, 7];
447     float4 b = [2, 3, 4, 5];
448     int4 c = cast(int4)(greaterMask!float4(a, b));
449     static immutable int[4] correct = [0, 0, 0xffff_ffff, 0xffff_ffff];
450     assert(c.array == correct);
451 }
452 
453 static if (MMXSizedVectorsAreEmulated)
454 {
455     /// MMX-like SIMD types
456     struct float2
457     {
458         float[2] array;
459         mixin VectorOps!(float2, float[2]);
460     }
461 
462     struct byte8
463     {
464         byte[8] array;
465         mixin VectorOps!(byte8, byte[8]);
466     }
467 
468     struct short4
469     {
470         short[4] array;
471         mixin VectorOps!(short4, short[4]);
472     }
473 
474     struct int2
475     {
476         int[2] array;
477         mixin VectorOps!(int2, int[2]);
478     }
479 
480     struct long1
481     {
482         long[1] array;
483         mixin VectorOps!(long1, long[1]);
484     }
485 }
486 else
487 {
    // On this compiler, MMX-sized core.simd vectors work natively.
489     public import core.simd;
490     alias Vector!(long [1]) long1;
491     alias Vector!(float[2]) float2;
492     alias Vector!(int  [2]) int2;
493     alias Vector!(short[4]) short4;
494     alias Vector!(byte [8]) byte8;
495 }
496 
497 static assert(float2.sizeof == 8);
498 static assert(byte8.sizeof == 8);
499 static assert(short4.sizeof == 8);
500 static assert(int2.sizeof == 8);
501 static assert(long1.sizeof == 8);
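
// Quick sketch of element-wise arithmetic on the 8-byte types above
// (example values are assumptions; works for emulated and native vectors alike).
unittest
{
    int2 a = [1, 2];
    int2 b = [10, 20];
    a += b;                        // element-wise opOpAssign
    int[2] correct = [11, 22];
    assert(a.array == correct);

    short4 c = [1, -2, 3, -4];
    short4 d = -c;                 // element-wise negation
    short[4] correctD = [-1, 2, -3, 4];
    assert(d.array == correctD);
}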
502 
503 
504 static if (SSESizedVectorsAreEmulated)
505 {
506     /// SSE-like SIMD types
507 
508     struct float4
509     {
510         float[4] array;
511         mixin VectorOps!(float4, float[4]);
512     }
513 
514     struct byte16
515     {
516         byte[16] array;
517         mixin VectorOps!(byte16, byte[16]);
518     }
519 
520     struct short8
521     {
522         short[8] array;
523         mixin VectorOps!(short8, short[8]);
524     }
525 
526     struct int4
527     {
528         int[4] array;
529         mixin VectorOps!(int4, int[4]);
530     }
531 
532     struct long2
533     {
534         long[2] array;
535         mixin VectorOps!(long2, long[2]);
536     }
537 
538     struct double2
539     {
540         double[2] array;
541         mixin VectorOps!(double2, double[2]);
542     }
543 }
544 
545 static assert(float4.sizeof == 16);
546 static assert(byte16.sizeof == 16);
547 static assert(short8.sizeof == 16);
548 static assert(int4.sizeof == 16);
549 static assert(long2.sizeof == 16);
550 static assert(double2.sizeof == 16);
551 
552 
553 static if (AVXSizedVectorsAreEmulated)
554 {
555     /// AVX-like SIMD types
556 
557     struct float8
558     {
559         float[8] array;
560         mixin VectorOps!(float8, float[8]);
561     }
562 
563     struct byte32
564     {
565         byte[32] array;
566         mixin VectorOps!(byte32, byte[32]);
567     }
568 
569     struct short16
570     {
571         short[16] array;
572         mixin VectorOps!(short16, short[16]);
573     }
574 
575     struct int8
576     {
577         int[8] array;
578         mixin VectorOps!(int8, int[8]);
579     }
580 
581     struct long4
582     {
583         long[4] array;
584         mixin VectorOps!(long4, long[4]);
585     }
586 
587     struct double4
588     {
589         double[4] array;
590         mixin VectorOps!(double4, double[4]);
591     }
592 }
593 else
594 {
    public import core.simd;
596 }
597 static assert(float8.sizeof == 32);
598 static assert(byte32.sizeof == 32);
599 static assert(short16.sizeof == 32);
600 static assert(int8.sizeof == 32);
601 static assert(long4.sizeof == 32);
602 static assert(double4.sizeof == 32);
603 
604 
605 
606 
607 alias __m256 = float8;
608 alias __m256i = long4; // long long __vector with ICC, GCC, and clang
609 alias __m256d = double4;
610 alias __m128 = float4;
611 alias __m128i = int4;
612 alias __m128d = double2;
613 alias __m64 = long1; // like in Clang, __m64 is a vector of 1 long
614 
615 int _MM_SHUFFLE2(int x, int y) pure @safe
616 {
617     assert(x >= 0 && x <= 1);
618     assert(y >= 0 && y <= 1);
619     return (x << 1) | y;
620 }
621 
622 int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
623 {
624     assert(x >= 0 && x <= 3);
625     assert(y >= 0 && y <= 3);
626     assert(z >= 0 && z <= 3);
627     assert(w >= 0 && w <= 3);
628     return (z<<6) | (y<<4) | (x<<2) | w;
629 }
630 
631 // test assignment from scalar to vector type
632 unittest
633 {
634     float4 A = 3.0f;
635     float[4] correctA = [3.0f, 3.0f, 3.0f, 3.0f];
636     assert(A.array == correctA);
637 
638     int2 B = 42;
639     int[2] correctB = [42, 42];
640     assert(B.array == correctB);
641 }