1 /**
2 * `core.simd` emulation layer.
3 *
4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module inteli.types;
8 
9 
10 pure:
11 nothrow:
12 @nogc:
13 
14 version(GNU)
15 {
16     // Note: for GDC support, be sure to use https://explore.dgnu.org/
17 
18     version(X86_64)
19     {
20         enum MMXSizedVectorsAreEmulated = false;
21         enum SSESizedVectorsAreEmulated = false;
22 
23         // TODO: use D_AVX and D_AVX2 eventually to detect AVX?
24         enum AVXSizedVectorsAreEmulated = true;
25 
26         import gcc.builtins;
27 
28         float4 loadUnaligned(Vec)(const(float)* pvec) @trusted if (is(Vec == float4))
29         {
30             return __builtin_ia32_loadups(pvec);
31         }
32 
33         double2 loadUnaligned(Vec)(const(double)* pvec) @trusted if (is(Vec == double2))
34         {
35             return __builtin_ia32_loadupd(pvec);
36         }
37 
38         byte16 loadUnaligned(Vec)(const(byte)* pvec) @trusted if (is(Vec == byte16))
39         {
40             return cast(byte16) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
41         }
42 
43         short8 loadUnaligned(Vec)(const(short)* pvec) @trusted if (is(Vec == short8))
44         {
45             return cast(short8) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
46         }
47 
48         int4 loadUnaligned(Vec)(const(int)* pvec) @trusted if (is(Vec == int4))
49         {
50             return cast(int4) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
51         }
52 
53         long2 loadUnaligned(Vec)(const(long)* pvec) @trusted if (is(Vec == long2))
54         {
55             return cast(long2) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
56         }
57 
58         void storeUnaligned(Vec)(Vec v, float* pvec) @trusted if (is(Vec == float4))
59         {
60             __builtin_ia32_storeups(pvec, v);
61         }
62 
63         void storeUnaligned(Vec)(Vec v, double* pvec) @trusted if (is(Vec == double2))
64         {
65             __builtin_ia32_storeupd(pvec, v);
66         }
67 
68         void storeUnaligned(Vec)(Vec v, byte* pvec) @trusted if (is(Vec == byte16))
69         {
70             __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
71         }
72 
73         void storeUnaligned(Vec)(Vec v, short* pvec) @trusted if (is(Vec == short8))
74         {
75             __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
76         }
77 
78         void storeUnaligned(Vec)(Vec v, int* pvec) @trusted if (is(Vec == int4))
79         {
80             __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
81         }
82 
83         void storeUnaligned(Vec)(Vec v, long* pvec) @trusted if (is(Vec == long2))
84         {
85             __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
86         }
87 
        // TODO: for performance, replace this with a GDC intrinsic wherever possible
89         Vec shufflevector(Vec, mask...)(Vec a, Vec b) @trusted
90         {
91             enum Count = Vec.array.length;
92             static assert(mask.length == Count);
93 
94             Vec r = void;
95             foreach(int i, m; mask)
96             {
97                 static assert (m < Count * 2);
98                 int ind = cast(int)m;
99                 if (ind < Count)
100                     r.ptr[i] = a.array[ind];
101                 else
102                     r.ptr[i] = b.array[ind - Count];
103             }
104             return r;
105         }
106     }
107     else
108     {
109         enum MMXSizedVectorsAreEmulated = true;
110         enum SSESizedVectorsAreEmulated = true;
111         enum AVXSizedVectorsAreEmulated = true;
112     }
113 }
114 else version(LDC)
115 {
116     public import ldc.simd;
117 
    // Use this alias to make it explicit that a call should only happen with LDC,
    // for example where an emulated shufflevector would just be wasteful.
    alias shufflevectorLDC = shufflevector;
121 
122     enum MMXSizedVectorsAreEmulated = false;
123     enum SSESizedVectorsAreEmulated = false;
124     enum AVXSizedVectorsAreEmulated = false;
125 }
126 else version(DigitalMars)
127 {
128     public import core.simd;
129 
130     version(D_SIMD)
131     {
132         enum MMXSizedVectorsAreEmulated = true;
133 
134         static if (__VERSION__ >= 2099)
135         {
136             // Trying out D_SIMD finally, with DMD 2.099
137             //enum SSESizedVectorsAreEmulated = false;
138 
139             // It didn't work, maybe one day.
140             enum SSESizedVectorsAreEmulated = true;
141         }
142         else
143         {
            // Basically blocked by DMD backend issues tagged codegen, backend, or SIMD in Bugzilla.
145             enum SSESizedVectorsAreEmulated = true; 
146         }
147 
148         enum AVXSizedVectorsAreEmulated = true;
149     }
150     else
151     {
152         // Some DMD 32-bit targets don't have D_SIMD
153         enum MMXSizedVectorsAreEmulated = true;
154         enum SSESizedVectorsAreEmulated = true;
155         enum AVXSizedVectorsAreEmulated = true;
156     }
157 }
158 
159 enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;
160 
161 version(GNU)
162     enum bool DefineGenericLoadStoreUnaligned = false;
163 else
164     enum bool DefineGenericLoadStoreUnaligned = CoreSimdIsEmulated;
165 
166 
167 static if (CoreSimdIsEmulated)
168 {
169     // core.simd is emulated in some capacity: introduce `VectorOps`
170 
171     mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
172     {
173         enum Count = N;
174         alias Base = BaseType;
175 
176         BaseType* ptr() return pure nothrow @nogc
177         {
178             return array.ptr;
179         }
180 
181         // Unary operators
182         VectorType opUnary(string op)() pure nothrow @safe @nogc
183         {
184             VectorType res = void;
185             mixin("res.array[] = " ~ op ~ "array[];");
186             return res;
187         }
188 
189         // Binary operators
190         VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
191         {
192             VectorType res = void;
193             mixin("res.array[] = array[] " ~ op ~ " other.array[];");
194             return res;
195         }
196 
197         // Assigning a BaseType value
198         void opAssign(BaseType e) pure nothrow @safe @nogc
199         {
200             array[] = e;
201         }
202 
203         // Assigning a static array
204         void opAssign(ArrayType v) pure nothrow @safe @nogc
205         {
206             array[] = v[];
207         }
208 
209         void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
210         {
211             mixin("array[] "  ~ op ~ "= other.array[];");
212         }
213 
        // Constructor from a static array
215         this(ArrayType v) pure nothrow @safe @nogc
216         {
217             array[] = v[];
218         }
219 
220         // Broadcast constructor
221         this(BaseType x) pure nothrow @safe @nogc
222         {
223             array[] = x;
224         }
225 
        /// We can't support implicit conversion ("Vector types of the same size can be
        /// implicitly converted among each other."), but we do support explicit casting.
        /// Casting to another vector type of the same size is always just a raw copy.
        VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
            if (VecDest.sizeof == VectorType.sizeof)
        {
            VecDest dest = void;
            // Raw copy of the bits
            dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
            return dest;
        }
237 
238         ref inout(BaseType) opIndex(size_t i) inout pure nothrow @safe @nogc
239         {
240             return array[i];
241         }
242 
243     }
244 
    // Deprecated: it just wasn't interesting enough; use v.array[i] instead.
246     deprecated auto extractelement(Vec, int index, Vec2)(Vec2 vec) @trusted
247     {
248         static assert(Vec.sizeof == Vec2.sizeof);
249         import core.stdc.string: memcpy;
250         Vec v = void;
251         memcpy(&v, &vec, Vec2.sizeof);
252         return v.array[index];
253     }
254 
    // Deprecated: it just wasn't interesting enough; use v.ptr[i] = x instead.
256     deprecated auto insertelement(Vec, int index, Vec2)(Vec2 vec, Vec.Base e) @trusted
257     {
258         static assert(Vec.sizeof == Vec2.sizeof);
259         import core.stdc.string: memcpy;
260         Vec v = void;
261         memcpy(&v, &vec, Vec2.sizeof);
262         v.array[index] = e;
263         return v;
264     }
265 }
266 else
267 {
268     public import core.simd;
269 
    // GDC cannot implicitly convert a __vector from signed to unsigned, but LDC can,
    // and LDC sometimes needs those unsigned vector types for some intrinsics.
    // For internal use only.
273     package alias ushort8 = Vector!(ushort[8]);
274     package alias ubyte8  = Vector!(ubyte[8]);
275     package alias ubyte16 = Vector!(ubyte[16]);
276 }
277 
278 static if (DefineGenericLoadStoreUnaligned)
279 {
280     template loadUnaligned(Vec)
281     {
282         // Note: can't be @safe with this signature
283         Vec loadUnaligned(const(BaseType!Vec)* pvec) @trusted
284         {
285             enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
286                                 || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
287                                 || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );
288 
289             static if (isVector)
290             {
291                 // PERF DMD
            /* enabling this needs loadUnaligned and storeUnaligned to move to internals.d
293                 static if (DMD_with_DSIMD && Vec.sizeof == 8)
294                 {
295                     static if (is(Vec == double2))
296                         return cast(Vec)__simd(XMM.LODUPD, *pvec);
297                     else static if (is(Vec == float4))
298                         return cast(Vec)__simd(XMM.LODUPS, *pvec);
299                     else
300                         return cast(Vec)__simd(XMM.LODDQU, *pvec);
301                 }
302                 else */
303                 {
304                     enum size_t Count = Vec.array.length;
305                     Vec result;
306                     foreach(int i; 0..Count)
307                     {
308                         result.ptr[i] = pvec[i];
309                     }
310                     return result;
311                 }
312             }
313             else
314             {
                // Since this vector is emulated, it doesn't have alignment constraints
                // and as such we can just cast the pointer.
317                 return *cast(Vec*)(pvec);
318             }
319         }
320     }
321 
322     template storeUnaligned(Vec)
323     {
324         // Note: can't be @safe with this signature
325         void storeUnaligned(Vec v, BaseType!Vec* pvec) @trusted
326         {
327             enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
328                                 || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
329                                 || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );
330 
331             static if (isVector)
332             {
333                 // PERF DMD
            /* enabling this needs loadUnaligned and storeUnaligned to move to internals.d
335                 static if (DMD_with_DSIMD && Vec.sizeof == 8)
336                 {
337                     static if (is(Vec == double2))
338                         __simd_sto(XMM.STOUPD, *pvec, value);
339                     else static if (is(Vec == float4))
340                         __simd_sto(XMM.STOUPS, *pvec, value);
341                     else
342                         __simd_sto(XMM.STODQU, *pvec, value);
343                 }
344                 else*/
345                 {
346                     enum size_t Count = Vec.array.length;
347                     foreach(int i; 0..Count)
348                         pvec[i] = v.array[i];
349                 }
350             }
351             else
352             {
353                 *cast(Vec*)(pvec) = v;
354             }
355         }
356     }
357 
358     Vec shufflevector(Vec, mask...)(Vec a, Vec b) @safe if (Vec.sizeof < 32)
359     {
360         enum size_t Count = Vec.array.length;
361         static assert(mask.length == Count);
362 
363         Vec r = void;
364         foreach(int i, m; mask)
365         {
366             static assert (m < Count * 2);
367             enum int ind = cast(int)m;
368             static if (ind < Count)
369                 r.array[i] = a.array[ind];
370             else
371                 r.array[i] = b.array[ind-Count];
372         }
373         return r;
374     }
375 }
376 
// Emulate ldc.simd cmpMask and other comparison masks.
// Note: these should be deprecated on non-LDC,
// since the generated code is slower there.
380 version(LDC)
381 {} 
382 else
383 {
384     private template BaseType(V)
385     {
386         alias typeof( ( { V v; return v; }()).array[0]) BaseType;
387     }
388 
389     private template TrueMask(V)
390     {
391         alias Elem = BaseType!V;
392 
393         static if (is(Elem == float))
394         {
395             immutable uint m1 = 0xffffffff;
396             enum Elem TrueMask = *cast(float*)(&m1);
397         }
398         else static if (is(Elem == double))
399         {
400             immutable ulong m1 = 0xffffffff_ffffffff;
401             enum Elem TrueMask = *cast(double*)(&m1);
402         }
403         else // integer case
404         {
405             enum Elem TrueMask = -1;
406         }
407     }
408 
409     Vec equalMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "oeq" comparison
410     {
411         enum size_t Count = Vec.array.length;
412         Vec result;
413         foreach(int i; 0..Count)
414         {
415             bool cond = a.array[i] == b.array[i];
416             result.ptr[i] = cond ? TrueMask!Vec : 0;
417         }
418         return result;
419     }
420 
421     Vec notEqualMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "one" comparison
422     {
423         enum size_t Count = Vec.array.length;
424         Vec result;
425         foreach(int i; 0..Count)
426         {
427             bool cond = a.array[i] != b.array[i];
428             result.ptr[i] = cond ? TrueMask!Vec : 0;
429         }
430         return result;
431     }
432 
433     Vec greaterMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "ogt" comparison
434     {
435         enum size_t Count = Vec.array.length;
436         Vec result;
437         foreach(int i; 0..Count)
438         {
439             bool cond = a.array[i] > b.array[i];
440             result.ptr[i] = cond ? TrueMask!Vec : 0;
441         }
442         return result;
443     }
444 }
445 
446 unittest
447 {
448     float4 a = [1, 3, 5, 7];
449     float4 b = [2, 3, 4, 5];
450     int4 c = cast(int4)(greaterMask!float4(a, b));
451     static immutable int[4] correct = [0, 0, 0xffff_ffff, 0xffff_ffff];
452     assert(c.array == correct);
453 }
454 
455 static if (MMXSizedVectorsAreEmulated)
456 {
457     /// MMX-like SIMD types
458     struct float2
459     {
460         float[2] array;
461         mixin VectorOps!(float2, float[2]);
462     }
463 
464     struct byte8
465     {
466         byte[8] array;
467         mixin VectorOps!(byte8, byte[8]);
468     }
469 
470     struct short4
471     {
472         short[4] array;
473         mixin VectorOps!(short4, short[4]);
474     }
475 
476     struct int2
477     {
478         int[2] array;
479         mixin VectorOps!(int2, int[2]);
480     }
481 
482     struct long1
483     {
484         long[1] array;
485         mixin VectorOps!(long1, long[1]);
486     }
487 }
488 else
489 {
    // On this compiler, MMX-sized vectors work natively.
491     public import core.simd;
492     alias Vector!(long [1]) long1;
493     alias Vector!(float[2]) float2;
494     alias Vector!(int  [2]) int2;
495     alias Vector!(short[4]) short4;
496     alias Vector!(byte [8]) byte8;
497 }
498 
499 static assert(float2.sizeof == 8);
500 static assert(byte8.sizeof == 8);
501 static assert(short4.sizeof == 8);
502 static assert(int2.sizeof == 8);
503 static assert(long1.sizeof == 8);
504 
505 
506 static if (SSESizedVectorsAreEmulated)
507 {
508     /// SSE-like SIMD types
509 
510     struct float4
511     {
512         float[4] array;
513         mixin VectorOps!(float4, float[4]);
514     }
515 
516     struct byte16
517     {
518         byte[16] array;
519         mixin VectorOps!(byte16, byte[16]);
520     }
521 
522     struct short8
523     {
524         short[8] array;
525         mixin VectorOps!(short8, short[8]);
526     }
527 
528     struct int4
529     {
530         int[4] array;
531         mixin VectorOps!(int4, int[4]);
532     }
533 
534     struct long2
535     {
536         long[2] array;
537         mixin VectorOps!(long2, long[2]);
538     }
539 
540     struct double2
541     {
542         double[2] array;
543         mixin VectorOps!(double2, double[2]);
544     }
545 }
546 
547 static assert(float4.sizeof == 16);
548 static assert(byte16.sizeof == 16);
549 static assert(short8.sizeof == 16);
550 static assert(int4.sizeof == 16);
551 static assert(long2.sizeof == 16);
552 static assert(double2.sizeof == 16);
553 
554 
555 static if (AVXSizedVectorsAreEmulated)
556 {
557     /// AVX-like SIMD types
558 
559     struct float8
560     {
561         float[8] array;
562         mixin VectorOps!(float8, float[8]);
563     }
564 
565     struct byte32
566     {
567         byte[32] array;
568         mixin VectorOps!(byte32, byte[32]);
569     }
570 
571     struct short16
572     {
573         short[16] array;
574         mixin VectorOps!(short16, short[16]);
575     }
576 
577     struct int8
578     {
579         int[8] array;
580         mixin VectorOps!(int8, int[8]);
581     }
582 
583     struct long4
584     {
585         long[4] array;
586         mixin VectorOps!(long4, long[4]);
587     }
588 
589     struct double4
590     {
591         double[4] array;
592         mixin VectorOps!(double4, double[4]);
593     }
594 }
595 
596 static assert(float8.sizeof == 32);
597 static assert(byte32.sizeof == 32);
598 static assert(short16.sizeof == 32);
599 static assert(int8.sizeof == 32);
600 static assert(long4.sizeof == 32);
601 static assert(double4.sizeof == 32);
602 
603 
604 
605 
606 alias __m256 = float8;
607 alias __m256i = long4; // long long __vector with ICC, GCC, and clang
608 alias __m256d = double4;
609 alias __m128 = float4;
610 alias __m128i = int4;
611 alias __m128d = double2;
612 alias __m64 = long1; // like in Clang, __m64 is a vector of 1 long
613 
614 int _MM_SHUFFLE2(int x, int y) pure @safe
615 {
616     assert(x >= 0 && x <= 1);
617     assert(y >= 0 && y <= 1);
618     return (x << 1) | y;
619 }
620 
621 int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
622 {
623     assert(x >= 0 && x <= 3);
624     assert(y >= 0 && y <= 3);
625     assert(z >= 0 && z <= 3);
626     assert(w >= 0 && w <= 3);
627     return (z<<6) | (y<<4) | (x<<2) | w;
628 }
629 
630 // test assignment from scalar to vector type
631 unittest
632 {
633     float4 A = 3.0f;
634     float[4] correctA = [3.0f, 3.0f, 3.0f, 3.0f];
635     assert(A.array == correctA);
636 
637     int2 B = 42;
638     int[2] correctB = [42, 42];
639     assert(B.array == correctB);
640 }