1 /**
2 * `core.simd` emulation layer.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.types;
8 
9 
10 pure:
11 nothrow:
12 @nogc:
13 
14 version(GNU)
15 {
16     // Note: for GDC support, be sure to use https://explore.dgnu.org/
17 
18     version(X86_64)
19     {
20         enum MMXSizedVectorsAreEmulated = false;
21         enum SSESizedVectorsAreEmulated = false;
22 
23         // TODO: use D_AVX and D_AVX2 eventually to detect AVX?
24         enum AVXSizedVectorsAreEmulated = true;
25 
26         import gcc.builtins;
27 
28         float4 loadUnaligned(Vec)(const(float)* pvec) @trusted if (is(Vec == float4))
29         {
30             return __builtin_ia32_loadups(pvec);
31         }
32 
33         double2 loadUnaligned(Vec)(const(double)* pvec) @trusted if (is(Vec == double2))
34         {
35             return __builtin_ia32_loadupd(pvec);
36         }
37 
38         byte16 loadUnaligned(Vec)(const(byte)* pvec) @trusted if (is(Vec == byte16))
39         {
40             return cast(byte16) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
41         }
42 
43         short8 loadUnaligned(Vec)(const(short)* pvec) @trusted if (is(Vec == short8))
44         {
45             return cast(short8) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
46         }
47 
48         int4 loadUnaligned(Vec)(const(int)* pvec) @trusted if (is(Vec == int4))
49         {
50             return cast(int4) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
51         }
52 
53         long2 loadUnaligned(Vec)(const(long)* pvec) @trusted if (is(Vec == long2))
54         {
55             return cast(long2) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
56         }
57 
58         void storeUnaligned(Vec)(Vec v, float* pvec) @trusted if (is(Vec == float4))
59         {
60             __builtin_ia32_storeups(pvec, v);
61         }
62 
63         void storeUnaligned(Vec)(Vec v, double* pvec) @trusted if (is(Vec == double2))
64         {
65             __builtin_ia32_storeupd(pvec, v);
66         }
67 
68         void storeUnaligned(Vec)(Vec v, byte* pvec) @trusted if (is(Vec == byte16))
69         {
70             __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
71         }
72 
73         void storeUnaligned(Vec)(Vec v, short* pvec) @trusted if (is(Vec == short8))
74         {
75             __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
76         }
77 
78         void storeUnaligned(Vec)(Vec v, int* pvec) @trusted if (is(Vec == int4))
79         {
80             __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
81         }
82 
83         void storeUnaligned(Vec)(Vec v, long* pvec) @trusted if (is(Vec == long2))
84         {
85             __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
86         }
87 
        // TODO: for performance, replace this wherever possible with a GDC intrinsic
89         Vec shufflevector(Vec, mask...)(Vec a, Vec b) @trusted
90         {
91             enum Count = Vec.array.length;
92             static assert(mask.length == Count);
93 
94             Vec r = void;
95             foreach(int i, m; mask)
96             {
97                 static assert (m < Count * 2);
98                 int ind = cast(int)m;
99                 if (ind < Count)
100                     r.ptr[i] = a.array[ind];
101                 else
102                     r.ptr[i] = b.array[ind - Count];
103             }
104             return r;
105         }
106     }
107     else
108     {
109         enum MMXSizedVectorsAreEmulated = true;
110         enum SSESizedVectorsAreEmulated = true;
111         enum AVXSizedVectorsAreEmulated = true;
112     }
113 }
114 else version(LDC)
115 {
116     public import ldc.simd;
117 
    // Use this alias to mark call sites that should only be compiled with LDC,
    // for example where the emulated shufflevector would just be wasteful.
120     alias shufflevectorLDC = shufflevector; 
121 
122     enum MMXSizedVectorsAreEmulated = false;
123     enum SSESizedVectorsAreEmulated = false;
124     enum AVXSizedVectorsAreEmulated = false;
125 }
126 else version(DigitalMars)
127 {
128     public import core.simd;
129 
130     version(D_SIMD)
131     {
132         enum MMXSizedVectorsAreEmulated = true;
133 
134         static if (__VERSION__ >= 2100)
135         {
            // Trying out D_SIMD finally, with DMD 2.100:
            //enum SSESizedVectorsAreEmulated = false;

            // It didn't work; maybe one day.
            enum SSESizedVectorsAreEmulated = true;
141         }
142         else
143         {
            // Basically blocked by DMD backend issues (tagged codegen, backend, or SIMD in Bugzilla).
145             enum SSESizedVectorsAreEmulated = true; 
146         }
147 
148         enum AVXSizedVectorsAreEmulated = true;
149     }
150     else
151     {
152         // Some DMD 32-bit targets don't have D_SIMD
153         enum MMXSizedVectorsAreEmulated = true;
154         enum SSESizedVectorsAreEmulated = true;
155         enum AVXSizedVectorsAreEmulated = true;
156     }
157 }
158 
159 enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;
160 
161 version(GNU)
162     enum bool DefineGenericLoadStoreUnaligned = false;
163 else
164     enum bool DefineGenericLoadStoreUnaligned = CoreSimdIsEmulated;
165 
166 
167 static if (CoreSimdIsEmulated)
168 {
169     // core.simd is emulated in some capacity: introduce `VectorOps`
170 
171     mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
172     {
173         enum Count = N;
174         alias Base = BaseType;
175 
176         BaseType* ptr() return pure nothrow @nogc
177         {
178             return array.ptr;
179         }
180 
181         // Unary operators
182         VectorType opUnary(string op)() pure nothrow @safe @nogc
183         {
184             VectorType res = void;
185             mixin("res.array[] = " ~ op ~ "array[];");
186             return res;
187         }
188 
189         // Binary operators
190         VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
191         {
192             VectorType res = void;
193             mixin("res.array[] = array[] " ~ op ~ " other.array[];");
194             return res;
195         }
196 
197         // Assigning a BaseType value
198         void opAssign(BaseType e) pure nothrow @safe @nogc
199         {
200             array[] = e;
201         }
202 
203         // Assigning a static array
204         void opAssign(ArrayType v) pure nothrow @safe @nogc
205         {
206             array[] = v[];
207         }
208 
209         void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
210         {
211             mixin("array[] "  ~ op ~ "= other.array[];");
212         }
213 
214         // Assigning a dyn array
215         this(ArrayType v) pure nothrow @safe @nogc
216         {
217             array[] = v[];
218         }
219 
220         // Broadcast constructor
221         this(BaseType x) pure nothrow @safe @nogc
222         {
223             array[] = x;
224         }
225 
        /// We can't support implicit conversion, but we do support explicit casting.
        /// (For real vector types, "Vector types of the same size can be implicitly
        /// converted among each other.")
        /// Casting to another vector type is always just a raw copy.
229         VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
230             if (VecDest.sizeof == VectorType.sizeof)
231             {
232                 VecDest dest = void;
233                 // Copy
234                 dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
235                 return dest;
236             }
237 
238         ref inout(BaseType) opIndex(size_t i) inout return pure nothrow @safe @nogc
239         {
240             return array[i];
241         }
242 
243     }
244 
    // They just weren't interesting enough; use v.array[i] instead.
246     deprecated auto extractelement(Vec, int index, Vec2)(Vec2 vec) @trusted
247     {
248         static assert(Vec.sizeof == Vec2.sizeof);
249         import core.stdc.string: memcpy;
250         Vec v = void;
251         memcpy(&v, &vec, Vec2.sizeof);
252         return v.array[index];
253     }
254 
    // They just weren't interesting enough; use v.ptr[i] = x instead.
256     deprecated auto insertelement(Vec, int index, Vec2)(Vec2 vec, Vec.Base e) @trusted
257     {
258         static assert(Vec.sizeof == Vec2.sizeof);
259         import core.stdc.string: memcpy;
260         Vec v = void;
261         memcpy(&v, &vec, Vec2.sizeof);
262         v.array[index] = e;
263         return v;
264     }
265 }
266 else
267 {
268     public import core.simd;
269 
    // GDC cannot implicitly convert a __vector from signed to unsigned, but LDC can,
    // and LDC sometimes needs those unsigned vector types for some intrinsics.
272     // For internal use only.
273     package alias ushort8 = Vector!(ushort[8]);
274     package alias ubyte8  = Vector!(ubyte[8]);
275     package alias ubyte16 = Vector!(ubyte[16]);
276 }
277 
278 static if (DefineGenericLoadStoreUnaligned)
279 {
280     template loadUnaligned(Vec)
281     {
282         // Note: can't be @safe with this signature
283         Vec loadUnaligned(const(BaseType!Vec)* pvec) @trusted
284         {
285             enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
286                                 || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
287                                 || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );
288 
289             static if (isVector)
290             {
291                 // PERF DMD
292                 // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* Enabling this requires moving loadUnaligned and storeUnaligned to internals.d.
294                 static if (DMD_with_DSIMD && Vec.sizeof == 8)
295                 {
296                     static if (is(Vec == double2))
297                         return cast(Vec)__simd(XMM.LODUPD, *pvec);
298                     else static if (is(Vec == float4))
299                         return cast(Vec)__simd(XMM.LODUPS, *pvec);
300                     else
301                         return cast(Vec)__simd(XMM.LODDQU, *pvec);
302                 }
303                 else */
304                 {
305                     enum size_t Count = Vec.array.length;
306                     Vec result;
307                     foreach(int i; 0..Count)
308                     {
309                         result.ptr[i] = pvec[i];
310                     }
311                     return result;
312                 }
313             }
314             else
315             {
                // Since this vector is emulated, it doesn't have alignment
                // constraints, and as such we can just cast it.
318                 return *cast(Vec*)(pvec);
319             }
320         }
321     }
322 
323     template storeUnaligned(Vec)
324     {
325         // Note: can't be @safe with this signature
326         void storeUnaligned(Vec v, BaseType!Vec* pvec) @trusted
327         {
328             enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
329                                 || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
330                                 || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );
331 
332             static if (isVector)
333             {
334                 // PERF DMD
335                 // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* Enabling this requires moving loadUnaligned and storeUnaligned to internals.d.
337                 static if (DMD_with_DSIMD && Vec.sizeof == 8)
338                 {
                    static if (is(Vec == double2))
                        __simd_sto(XMM.STOUPD, *pvec, v);
                    else static if (is(Vec == float4))
                        __simd_sto(XMM.STOUPS, *pvec, v);
                    else
                        __simd_sto(XMM.STODQU, *pvec, v);
345                 }
346                 else*/
347                 {
348                     enum size_t Count = Vec.array.length;
349                     foreach(int i; 0..Count)
350                         pvec[i] = v.array[i];
351                 }
352             }
353             else
354             {
355                 *cast(Vec*)(pvec) = v;
356             }
357         }
358     }
359 
360     Vec shufflevector(Vec, mask...)(Vec a, Vec b) @safe if (Vec.sizeof < 32)
361     {
362         enum size_t Count = Vec.array.length;
363         static assert(mask.length == Count);
364 
365         Vec r = void;
366         foreach(int i, m; mask)
367         {
368             static assert (m < Count * 2);
369             enum int ind = cast(int)m;
370             static if (ind < Count)
371                 r.array[i] = a.array[ind];
372             else
373                 r.array[i] = b.array[ind-Count];
374         }
375         return r;
376     }
377 }
378 
379 // Emulate ldc.simd cmpMask and other masks.
// Note: these should be deprecated on non-LDC,
// since the code generated this way is slower.
382 version(LDC)
383 {} 
384 else
385 {
386     private template BaseType(V)
387     {
        alias BaseType = typeof(( { V v; return v; }()).array[0]);
389     }
390 
391     private template TrueMask(V)
392     {
393         alias Elem = BaseType!V;
394 
395         static if (is(Elem == float))
396         {
397             immutable uint m1 = 0xffffffff;
398             enum Elem TrueMask = *cast(float*)(&m1);
399         }
400         else static if (is(Elem == double))
401         {
402             immutable ulong m1 = 0xffffffff_ffffffff;
403             enum Elem TrueMask = *cast(double*)(&m1);
404         }
405         else // integer case
406         {
407             enum Elem TrueMask = -1;
408         }
409     }
410 
411     Vec equalMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "oeq" comparison
412     {
413         enum size_t Count = Vec.array.length;
414         Vec result;
415         foreach(int i; 0..Count)
416         {
417             bool cond = a.array[i] == b.array[i];
418             result.ptr[i] = cond ? TrueMask!Vec : 0;
419         }
420         return result;
421     }
422 
423     Vec notEqualMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "one" comparison
424     {
425         enum size_t Count = Vec.array.length;
426         Vec result;
427         foreach(int i; 0..Count)
428         {
429             bool cond = a.array[i] != b.array[i];
430             result.ptr[i] = cond ? TrueMask!Vec : 0;
431         }
432         return result;
433     }
434 
435     Vec greaterMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "ogt" comparison
436     {
437         enum size_t Count = Vec.array.length;
438         Vec result;
439         foreach(int i; 0..Count)
440         {
441             bool cond = a.array[i] > b.array[i];
442             result.ptr[i] = cond ? TrueMask!Vec : 0;
443         }
444         return result;
445     }
446 }
447 
448 unittest
449 {
450     float4 a = [1, 3, 5, 7];
451     float4 b = [2, 3, 4, 5];
452     int4 c = cast(int4)(greaterMask!float4(a, b));
453     static immutable int[4] correct = [0, 0, 0xffff_ffff, 0xffff_ffff];
454     assert(c.array == correct);
455 }
456 
457 static if (MMXSizedVectorsAreEmulated)
458 {
459     /// MMX-like SIMD types
460     struct float2
461     {
462         float[2] array;
463         mixin VectorOps!(float2, float[2]);
464     }
465 
466     struct byte8
467     {
468         byte[8] array;
469         mixin VectorOps!(byte8, byte[8]);
470     }
471 
472     struct short4
473     {
474         short[4] array;
475         mixin VectorOps!(short4, short[4]);
476     }
477 
478     struct int2
479     {
480         int[2] array;
481         mixin VectorOps!(int2, int[2]);
482     }
483 
484     struct long1
485     {
486         long[1] array;
487         mixin VectorOps!(long1, long[1]);
488     }
489 }
490 else
491 {
    // For this compiler, defining MMX-sized vectors works.
493     public import core.simd;
494     alias Vector!(long [1]) long1;
495     alias Vector!(float[2]) float2;
496     alias Vector!(int  [2]) int2;
497     alias Vector!(short[4]) short4;
498     alias Vector!(byte [8]) byte8;
499 }
500 
501 static assert(float2.sizeof == 8);
502 static assert(byte8.sizeof == 8);
503 static assert(short4.sizeof == 8);
504 static assert(int2.sizeof == 8);
505 static assert(long1.sizeof == 8);
506 
507 
508 static if (SSESizedVectorsAreEmulated)
509 {
510     /// SSE-like SIMD types
511 
512     struct float4
513     {
514         float[4] array;
515         mixin VectorOps!(float4, float[4]);
516     }
517 
518     struct byte16
519     {
520         byte[16] array;
521         mixin VectorOps!(byte16, byte[16]);
522     }
523 
524     struct short8
525     {
526         short[8] array;
527         mixin VectorOps!(short8, short[8]);
528     }
529 
530     struct int4
531     {
532         int[4] array;
533         mixin VectorOps!(int4, int[4]);
534     }
535 
536     struct long2
537     {
538         long[2] array;
539         mixin VectorOps!(long2, long[2]);
540     }
541 
542     struct double2
543     {
544         double[2] array;
545         mixin VectorOps!(double2, double[2]);
546     }
547 }
548 
549 static assert(float4.sizeof == 16);
550 static assert(byte16.sizeof == 16);
551 static assert(short8.sizeof == 16);
552 static assert(int4.sizeof == 16);
553 static assert(long2.sizeof == 16);
554 static assert(double2.sizeof == 16);
555 
556 
557 static if (AVXSizedVectorsAreEmulated)
558 {
559     /// AVX-like SIMD types
560 
561     struct float8
562     {
563         float[8] array;
564         mixin VectorOps!(float8, float[8]);
565     }
566 
567     struct byte32
568     {
569         byte[32] array;
570         mixin VectorOps!(byte32, byte[32]);
571     }
572 
573     struct short16
574     {
575         short[16] array;
576         mixin VectorOps!(short16, short[16]);
577     }
578 
579     struct int8
580     {
581         int[8] array;
582         mixin VectorOps!(int8, int[8]);
583     }
584 
585     struct long4
586     {
587         long[4] array;
588         mixin VectorOps!(long4, long[4]);
589     }
590 
591     struct double4
592     {
593         double[4] array;
594         mixin VectorOps!(double4, double[4]);
595     }
596 }
597 
598 static assert(float8.sizeof == 32);
599 static assert(byte32.sizeof == 32);
600 static assert(short16.sizeof == 32);
601 static assert(int8.sizeof == 32);
602 static assert(long4.sizeof == 32);
603 static assert(double4.sizeof == 32);
604 
605 
606 
607 
608 alias __m256 = float8;
609 alias __m256i = long4; // long long __vector with ICC, GCC, and clang
610 alias __m256d = double4;
611 alias __m128 = float4;
612 alias __m128i = int4;
613 alias __m128d = double2;
614 alias __m64 = long1; // like in Clang, __m64 is a vector of 1 long
615 
616 int _MM_SHUFFLE2(int x, int y) pure @safe
617 {
618     assert(x >= 0 && x <= 1);
619     assert(y >= 0 && y <= 1);
620     return (x << 1) | y;
621 }
622 
623 int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
624 {
625     assert(x >= 0 && x <= 3);
626     assert(y >= 0 && y <= 3);
627     assert(z >= 0 && z <= 3);
628     assert(w >= 0 && w <= 3);
629     return (z<<6) | (y<<4) | (x<<2) | w;
630 }
631 
632 // test assignment from scalar to vector type
633 unittest
634 {
635     float4 A = 3.0f;
636     float[4] correctA = [3.0f, 3.0f, 3.0f, 3.0f];
637     assert(A.array == correctA);
638 
639     int2 B = 42;
640     int[2] correctB = [42, 42];
641     assert(B.array == correctB);
642 }