/**
 * `core.simd` emulation layer.
 *
 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module inteli.types;


pure:
nothrow:
@nogc:

version(GNU)
{
    // Note: for GDC support, be sure to use https://explore.dgnu.org/

    version(X86_64)
    {
        enum MMXSizedVectorsAreEmulated = false;
        enum SSESizedVectorsAreEmulated = false;

        // TODO: use D_AVX and D_AVX2 eventually to detect AVX?
        enum AVXSizedVectorsAreEmulated = true;

        import gcc.builtins;

        float4 loadUnaligned(Vec)(const(float)* pvec) @trusted if (is(Vec == float4))
        {
            return __builtin_ia32_loadups(pvec);
        }

        double2 loadUnaligned(Vec)(const(double)* pvec) @trusted if (is(Vec == double2))
        {
            return __builtin_ia32_loadupd(pvec);
        }

        byte16 loadUnaligned(Vec)(const(byte)* pvec) @trusted if (is(Vec == byte16))
        {
            return cast(byte16) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        short8 loadUnaligned(Vec)(const(short)* pvec) @trusted if (is(Vec == short8))
        {
            return cast(short8) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        int4 loadUnaligned(Vec)(const(int)* pvec) @trusted if (is(Vec == int4))
        {
            return cast(int4) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        long2 loadUnaligned(Vec)(const(long)* pvec) @trusted if (is(Vec == long2))
        {
            return cast(long2) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        void storeUnaligned(Vec)(Vec v, float* pvec) @trusted if (is(Vec == float4))
        {
            __builtin_ia32_storeups(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, double* pvec) @trusted if (is(Vec == double2))
        {
            __builtin_ia32_storeupd(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, byte* pvec) @trusted if (is(Vec == byte16))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, short* pvec) @trusted if (is(Vec == short8))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, int* pvec) @trusted if (is(Vec == int4))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, long* pvec) @trusted if (is(Vec == long2))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        // TODO: for performance, replace this anywhere possible with a GDC intrinsic
        Vec shufflevector(Vec, mask...)(Vec a, Vec b) @trusted
        {
            enum Count = Vec.array.length;
            static assert(mask.length == Count);

            Vec r = void;
            foreach(int i, m; mask)
            {
                static assert(m < Count * 2);
                // mask elements are compile-time constants, so select lanes statically
                enum int ind = cast(int)m;
                static if (ind < Count)
                    r.ptr[i] = a.array[ind];
                else
                    r.ptr[i] = b.array[ind - Count];
            }
            return r;
        }
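
        // A minimal usage sketch (not from the original source): interleave the
        // two low lanes of each operand with the shufflevector defined above.
        unittest
        {
            float4 a = [1.0f, 2.0f, 3.0f, 4.0f];
            float4 b = [5.0f, 6.0f, 7.0f, 8.0f];
            // indices 0..3 select from a, 4..7 select from b
            float4 r = shufflevector!(float4, 0, 4, 1, 5)(a, b);
            float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f];
            assert(r.array == correct);
        }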
    }
    else
    {
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}
else version(LDC)
{
    public import ldc.simd;

    // Use this alias to make explicit that a call site is LDC-only,
    // for example where an emulated shufflevector would just be wasteful.
    alias shufflevectorLDC = shufflevector;

    enum MMXSizedVectorsAreEmulated = false;
    enum SSESizedVectorsAreEmulated = false;
    enum AVXSizedVectorsAreEmulated = false;
}
else version(DigitalMars)
{
    public import core.simd;

    version(D_SIMD)
    {
        enum MMXSizedVectorsAreEmulated = true;

        static if (__VERSION__ >= 2099)
        {
            // Trying out D_SIMD finally, with DMD 2.099
            enum SSESizedVectorsAreEmulated = false;
        }
        else
        {
            // Basically blocked by DMD backend issues, tagged codegen, backend, or SIMD in Bugzilla.
            enum SSESizedVectorsAreEmulated = true;
        }

        enum AVXSizedVectorsAreEmulated = true;
    }
    else
    {
        // Some DMD 32-bit targets don't have D_SIMD
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}

enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;

version(GNU)
    enum bool DefineGenericLoadStoreUnaligned = false;
else
    enum bool DefineGenericLoadStoreUnaligned = CoreSimdIsEmulated;


static if (CoreSimdIsEmulated)
{
    // core.simd is emulated in some capacity: introduce `VectorOps`

    mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
    {
        enum Count = N;
        alias Base = BaseType;

        BaseType* ptr() return pure nothrow @nogc
        {
            return array.ptr;
        }

        // Unary operators
        VectorType opUnary(string op)() pure nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = " ~ op ~ "array[];");
            return res;
        }

        // Binary operators
        VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = array[] " ~ op ~ " other.array[];");
            return res;
        }

        // Assigning a BaseType value
        void opAssign(BaseType e) pure nothrow @safe @nogc
        {
            array[] = e;
        }

        // Assigning a static array
        void opAssign(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
        {
            mixin("array[] "  ~ op ~ "= other.array[];");
        }

        // Construct from a static array
        this(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        // Broadcast constructor
        this(BaseType x) pure nothrow @safe @nogc
        {
            array[] = x;
        }

        /// Unlike `core.simd`, where "vector types of the same size can be
        /// implicitly converted among each other", we can't support implicit
        /// conversion; explicit casting is supported instead.
        /// Casting to another vector type is always just a raw copy.
        VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
            if (VecDest.sizeof == VectorType.sizeof)
        {
            VecDest dest = void;
            // Raw copy of the bits, reinterpreted as the destination array type
            dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
            return dest;
        }

        ref inout(BaseType) opIndex(size_t i) inout pure nothrow @safe @nogc
        {
            return array[i];
        }

    }

    // These just weren't interesting enough; use v.array[i] instead.
    deprecated auto extractelement(Vec, int index, Vec2)(Vec2 vec) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        return v.array[index];
    }

    // These just weren't interesting enough; use v.ptr[i] = x instead.
    deprecated auto insertelement(Vec, int index, Vec2)(Vec2 vec, Vec.Base e) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        v.array[index] = e;
        return v;
    }
}
else
{
    public import core.simd;

    // GDC cannot implicitly convert a __vector from signed to unsigned, but LDC can,
    // and LDC sometimes needs those unsigned vector types for some intrinsics.
    // For internal use only.
    package alias ushort8 = Vector!(ushort[8]);
    package alias ubyte8  = Vector!(ubyte[8]);
    package alias ubyte16 = Vector!(ubyte[16]);
}
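
// A small illustrative check (not from the original source): the operator
// semantics below should hold whether the vector types are emulated structs
// using `VectorOps`, or real `core.simd` vectors.
unittest
{
    int4 a = 1;           // broadcast construction
    int4 b = 2;
    int4 c = a + b;       // element-wise binary operator
    int[4] correct = [3, 3, 3, 3];
    assert(c.array == correct);

    // a same-size vector cast is always a raw copy of the bits
    float4 f = cast(float4) c;
    assert((cast(int4) f).array == correct);
}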

static if (DefineGenericLoadStoreUnaligned)
{
    template loadUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        Vec loadUnaligned(const(BaseType!Vec)* pvec) @trusted
        {
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                /* enabling this would require moving loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        return cast(Vec)__simd(XMM.LODUPD, *pvec);
                    else static if (is(Vec == float4))
                        return cast(Vec)__simd(XMM.LODUPS, *pvec);
                    else
                        return cast(Vec)__simd(XMM.LODDQU, *pvec);
                }
                else */
                {
                    enum size_t Count = Vec.array.length;
                    Vec result;
                    foreach(int i; 0..Count)
                    {
                        result.ptr[i] = pvec[i];
                    }
                    return result;
                }
            }
            else
            {
                // Since this vector is emulated, it doesn't have alignment constraints
                // and as such we can just cast it.
                return *cast(Vec*)(pvec);
            }
        }
    }

    template storeUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        void storeUnaligned(Vec v, BaseType!Vec* pvec) @trusted
        {
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                /* enabling this would require moving loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        __simd_sto(XMM.STOUPD, *pvec, value);
                    else static if (is(Vec == float4))
                        __simd_sto(XMM.STOUPS, *pvec, value);
                    else
                        __simd_sto(XMM.STODQU, *pvec, value);
                }
                else*/
                {
                    enum size_t Count = Vec.array.length;
                    foreach(int i; 0..Count)
                        pvec[i] = v.array[i];
                }
            }
            else
            {
                *cast(Vec*)(pvec) = v;
            }
        }
    }

    Vec shufflevector(Vec, mask...)(Vec a, Vec b) @safe if (Vec.sizeof < 32)
    {
        enum size_t Count = Vec.array.length;
        static assert(mask.length == Count);

        Vec r = void;
        foreach(int i, m; mask)
        {
            static assert(m < Count * 2);
            enum int ind = cast(int)m;
            static if (ind < Count)
                r.array[i] = a.array[ind];
            else
                r.array[i] = b.array[ind-Count];
        }
        return r;
    }
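
    // An illustrative round-trip (not from the original source): load four
    // ints from a deliberately misaligned address, then store them back.
    unittest
    {
        align(16) int[5] buf = [0, 1, 2, 3, 4];
        int4 v = loadUnaligned!int4(buf.ptr + 1); // 4-byte offset: not 16-byte aligned
        static immutable int[4] correct = [1, 2, 3, 4];
        assert(v.array == correct);

        int[4] dst;
        storeUnaligned!int4(v, dst.ptr);
        assert(dst == correct);
    }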
}

// Emulate ldc.simd's cmpMask and the other mask functions.
// Note: these should eventually be deprecated on non-LDC,
// since the emulated versions generate slower code.
version(LDC)
{}
else
{
    private template BaseType(V)
    {
        alias typeof( ( { V v; return v; }()).array[0]) BaseType;
    }

    private template TrueMask(V)
    {
        alias Elem = BaseType!V;

        static if (is(Elem == float))
        {
            immutable uint m1 = 0xffffffff;
            enum Elem TrueMask = *cast(float*)(&m1);
        }
        else static if (is(Elem == double))
        {
            immutable ulong m1 = 0xffffffff_ffffffff;
            enum Elem TrueMask = *cast(double*)(&m1);
        }
        else // integer case
        {
            enum Elem TrueMask = -1;
        }
    }

    Vec equalMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "oeq" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] == b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }

    Vec notEqualMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "one" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] != b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }

    Vec greaterMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "ogt" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] > b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }
}

unittest
{
    float4 a = [1, 3, 5, 7];
    float4 b = [2, 3, 4, 5];
    int4 c = cast(int4)(greaterMask!float4(a, b));
    static immutable int[4] correct = [0, 0, 0xffff_ffff, 0xffff_ffff];
    assert(c.array == correct);
}
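
// A companion check (not from the original source): equalMask yields all-ones
// lanes where elements compare equal, all-zeroes lanes elsewhere.
unittest
{
    int4 a = [1, 2, 3, 4];
    int4 b = [1, 0, 3, 0];
    int4 m = equalMask!int4(a, b);
    static immutable int[4] correct = [-1, 0, -1, 0];
    assert(m.array == correct);
}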

static if (MMXSizedVectorsAreEmulated)
{
    /// MMX-like SIMD types
    struct float2
    {
        float[2] array;
        mixin VectorOps!(float2, float[2]);
    }

    struct byte8
    {
        byte[8] array;
        mixin VectorOps!(byte8, byte[8]);
    }

    struct short4
    {
        short[4] array;
        mixin VectorOps!(short4, short[4]);
    }

    struct int2
    {
        int[2] array;
        mixin VectorOps!(int2, int[2]);
    }

    struct long1
    {
        long[1] array;
        mixin VectorOps!(long1, long[1]);
    }
}
else
{
    // On this compiler, MMX-sized vectors work natively.
    public import core.simd;
    alias Vector!(long [1]) long1;
    alias Vector!(float[2]) float2;
    alias Vector!(int  [2]) int2;
    alias Vector!(short[4]) short4;
    alias Vector!(byte [8]) byte8;
}

static assert(float2.sizeof == 8);
static assert(byte8.sizeof == 8);
static assert(short4.sizeof == 8);
static assert(int2.sizeof == 8);
static assert(long1.sizeof == 8);


static if (SSESizedVectorsAreEmulated)
{
    /// SSE-like SIMD types

    struct float4
    {
        float[4] array;
        mixin VectorOps!(float4, float[4]);
    }

    struct byte16
    {
        byte[16] array;
        mixin VectorOps!(byte16, byte[16]);
    }

    struct short8
    {
        short[8] array;
        mixin VectorOps!(short8, short[8]);
    }

    struct int4
    {
        int[4] array;
        mixin VectorOps!(int4, int[4]);
    }

    struct long2
    {
        long[2] array;
        mixin VectorOps!(long2, long[2]);
    }

    struct double2
    {
        double[2] array;
        mixin VectorOps!(double2, double[2]);
    }
}

static assert(float4.sizeof == 16);
static assert(byte16.sizeof == 16);
static assert(short8.sizeof == 16);
static assert(int4.sizeof == 16);
static assert(long2.sizeof == 16);
static assert(double2.sizeof == 16);


static if (AVXSizedVectorsAreEmulated)
{
    /// AVX-like SIMD types

    struct float8
    {
        float[8] array;
        mixin VectorOps!(float8, float[8]);
    }

    struct byte32
    {
        byte[32] array;
        mixin VectorOps!(byte32, byte[32]);
    }

    struct short16
    {
        short[16] array;
        mixin VectorOps!(short16, short[16]);
    }

    struct int8
    {
        int[8] array;
        mixin VectorOps!(int8, int[8]);
    }

    struct long4
    {
        long[4] array;
        mixin VectorOps!(long4, long[4]);
    }

    struct double4
    {
        double[4] array;
        mixin VectorOps!(double4, double[4]);
    }
}

static assert(float8.sizeof == 32);
static assert(byte32.sizeof == 32);
static assert(short16.sizeof == 32);
static assert(int8.sizeof == 32);
static assert(long4.sizeof == 32);
static assert(double4.sizeof == 32);


alias __m256 = float8;
alias __m256i = long4; // long long __vector with ICC, GCC, and clang
alias __m256d = double4;
alias __m128 = float4;
alias __m128i = int4;
alias __m128d = double2;
alias __m64 = long1; // like in Clang, __m64 is a vector of 1 long

int _MM_SHUFFLE2(int x, int y) pure @safe
{
    assert(x >= 0 && x <= 1);
    assert(y >= 0 && y <= 1);
    return (x << 1) | y;
}

int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
{
    assert(x >= 0 && x <= 3);
    assert(y >= 0 && y <= 3);
    assert(z >= 0 && z <= 3);
    assert(w >= 0 && w <= 3);
    return (z<<6) | (y<<4) | (x<<2) | w;
}
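
// Illustrative values (not from the original source): _MM_SHUFFLE packs four
// 2-bit lane indices, most significant first, so the identity shuffle is 0b11_10_01_00.
unittest
{
    assert(_MM_SHUFFLE2(1, 0) == 2);        // identity for 2 lanes
    assert(_MM_SHUFFLE(3, 2, 1, 0) == 228); // 0b11_10_01_00, identity for 4 lanes
}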

// test assignment from scalar to vector type
unittest
{
    float4 A = 3.0f;
    float[4] correctA = [3.0f, 3.0f, 3.0f, 3.0f];
    assert(A.array == correctA);

    int2 B = 42;
    int[2] correctB = [42, 42];
    assert(B.array == correctB);
}