1 /**
2 * AVX intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
4 *
5 * Copyright: Guillaume Piolat 2022.
6 *            Johan Engelen 2022.
7 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8 */
9 module inteli.avxintrin;
10 
11 // AVX instructions
12 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX
// Note: this module works whether or not AVX is enabled at compile time.
// With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
// generate AVX instructions.
16 
17 public import inteli.types;
18 import inteli.internals;
19 
20 // Pull in all previous instruction set intrinsics.
21 public import inteli.tmmintrin;
22 
23 nothrow @nogc:
24 
/// Extract one 32-bit integer lane from `a`, selected with `imm8`.
/// Only the low 3 bits of `imm8` are used, so the lane index is always in 0..7.
int _mm256_extract_epi32 (__m256i a, const int imm8) pure @trusted
{
    // View the 256-bit value as eight 32-bit lanes, then wrap the selector.
    int8 lanes = cast(int8) a;
    const int lane = imm8 & 7;
    return lanes.array[lane];
}
unittest
{
    align(16) int[8] values = [-1, 2, -3, 4, 9, -7, 8, -6];
    __m256i V = _mm256_loadu_si256(cast(__m256i*) values.ptr);

    // Selector bits above the low three must be ignored.
    assert(-1 == _mm256_extract_epi32(V, 0));
    assert( 2 == _mm256_extract_epi32(V, 1 + 8));
    assert( 4 == _mm256_extract_epi32(V, 3 + 16));
    assert(-6 == _mm256_extract_epi32(V, 7 + 32));
}
39 
/// Load 256 bits of integer data from memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m256i _mm256_loadu_si256 (const(__m256i)* mem_addr) pure @trusted
{
    // PERF DMD
    pragma(inline, true);
    static if (GDC_with_AVX)
    {
        // The GDC builtin takes the source as a byte pointer.
        auto bytes = cast(const(char)*) mem_addr;
        return cast(__m256i) __builtin_ia32_loaddqu256(bytes);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m256i)(cast(long*) mem_addr);
    }
    else
    {
        // Generic path: copy the four 64-bit lanes one by one.
        const(long)* src = cast(const(long)*) mem_addr;
        long4 loaded;
        loaded.ptr[0] = src[0];
        loaded.ptr[1] = src[1];
        loaded.ptr[2] = src[2];
        loaded.ptr[3] = src[3];
        return loaded;
    }
}
unittest
{
    align(16) int[8] expected = [-1, 2, -3, 4, 9, -7, 8, -6];
    __m256i loaded = _mm256_loadu_si256(cast(__m256i*) expected.ptr);
    int8 lanes = cast(int8) loaded;
    assert(lanes.array == expected);
}
70 
/// Broadcast 8-bit integer `a` to all 32 elements of the return value.
__m256i _mm256_set1_epi8 (byte a) pure @trusted
{
    version(DigitalMars)
    {
        // Workaround for https://issues.dlang.org/show_bug.cgi?id=21469:
        // initialize a named vector instead of using a constructor expression.
        byte32 splat = a;
        return cast(__m256i) splat;
    }
    else
    {
        pragma(inline, true);
        byte32 splat = byte32(a);
        return cast(__m256i) splat;
    }
}
unittest
{
    byte32 r = cast(byte32) _mm256_set1_epi8(31);
    foreach (i; 0 .. 32)
        assert(r.array[i] == 31);
}
91 
/// Broadcast 16-bit integer `a` to all 16 elements of the return value.
__m256i _mm256_set1_epi16 (short a) pure @trusted
{
    version(DigitalMars)
    {
        // Workaround for https://issues.dlang.org/show_bug.cgi?id=21469
        // It used to ICE, now the codegen is just wrong.
        // TODO report this backend issue.
        short16 splat = a;
        return cast(__m256i) splat;
    }
    else
    {
        pragma(inline, true);
        short16 splat = short16(a);
        return cast(__m256i) splat;
    }
}
unittest
{
    short16 r = cast(short16) _mm256_set1_epi16(31);
    foreach (i; 0 .. 16)
        assert(r.array[i] == 31);
}
115 
/// Broadcast 32-bit integer `a` to all 8 elements of the return value.
__m256i _mm256_set1_epi32 (int a) pure @trusted
{
    version(DigitalMars)
    {
        // Without this form DMD generates bad code.
        // TODO report this backend issue.
        int8 splat = a;
        return cast(__m256i) splat;
    }
    else
    {
        pragma(inline, true);
        int8 splat = int8(a);
        return cast(__m256i) splat;
    }
}
unittest
{
    int8 r = cast(int8) _mm256_set1_epi32(31);
    foreach (i; 0 .. 8)
        assert(r.array[i] == 31);
}
138 
/// Set packed 8-bit integers with the supplied values in reverse order:
/// the first argument (`e31`) becomes element 0 of the result.
__m256i _mm256_setr_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
                          byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
                          byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9,  byte e8,
                          byte e7,  byte e6,  byte e5,  byte e4,  byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    // PERF GDC, not checked
    pragma(inline, true);

    // Lay the arguments out in memory in the order they were passed.
    byte[32] lanes = [ e31,  e30,  e29,  e28,  e27,  e26,  e25,  e24,
                       e23,  e22,  e21,  e20,  e19,  e18,  e17,  e16,
                       e15,  e14,  e13,  e12,  e11,  e10,  e9,   e8,
                       e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
    static if (GDC_with_AVX)
    {
        auto bytes = cast(const(char)*) lanes.ptr;
        return cast(__m256i) __builtin_ia32_loaddqu256(bytes);
    }
    else version(LDC)
    {
        byte32 vec = loadUnaligned!(byte32)(lanes.ptr);
        return cast(__m256i) vec;
    }
    else
    {
        // Generic path: element-wise copy into the vector.
        byte32 vec;
        foreach (i; 0 .. 32)
            vec.ptr[i] = lanes[i];
        return cast(__m256i) vec;
    }
}
unittest
{
    byte[32] expected = [-1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128];
    byte32 R = cast(byte32) _mm256_setr_epi8( -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128);
    assert(R.array == expected);
}
179 
/// Set packed 16-bit integers with the supplied values in reverse order:
/// the first argument (`e15`) becomes element 0 of the result.
__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9,  short e8,
                           short e7,  short e6,  short e5,  short e4,  short e3,  short e2,  short e1,  short e0) pure @trusted
{
    pragma(inline, true);

    // Lay the arguments out in memory in the order they were passed.
    short[16] lanes = [ e15,  e14,  e13,  e12,  e11,  e10,  e9,   e8,
                        e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
    static if (GDC_with_AVX)
    {
        auto bytes = cast(const(char)*) lanes.ptr;
        return cast(__m256i) __builtin_ia32_loaddqu256(bytes);
    }
    else version(LDC)
    {
        short16 vec = loadUnaligned!(short16)(lanes.ptr);
        return cast(__m256i) vec;
    }
    else
    {
        // Generic path: element-wise copy into the vector.
        short16 vec;
        foreach (i; 0 .. 16)
            vec.ptr[i] = lanes[i];
        return cast(__m256i) vec;
    }
}
unittest
{
    short[16] expected = [-1, 0, -21, 21, 42, 127, -42, -128,
                          -1, 0, -21, 21, 42, 127, -42, -128];
    short16 R = cast(short16) _mm256_setr_epi16(-1, 0, -21, 21, 42, 127, -42, -128,
                                                -1, 0, -21, 21, 42, 127, -42, -128);
    assert(R.array == expected);
}
211 
/// Set packed 32-bit integers with the supplied values in reverse order:
/// the first argument (`e7`) becomes element 0 of the result.
__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
{
    pragma(inline, true);

    // Lay the arguments out in memory in the order they were passed.
    int[8] lanes = [e7, e6, e5, e4, e3, e2, e1, e0];
    static if (GDC_with_AVX)
    {
        auto bytes = cast(const(char)*) lanes.ptr;
        return cast(__m256i) __builtin_ia32_loaddqu256(bytes);
    }
    else version(LDC)
    {
        int8 vec = loadUnaligned!(int8)(lanes.ptr);
        return cast(__m256i) vec;
    }
    else
    {
        // Generic path: element-wise copy into the vector.
        int8 vec;
        foreach (i; 0 .. 8)
            vec.ptr[i] = lanes[i];
        return cast(__m256i) vec;
    }
}
unittest
{
    int[8] expected = [-1, 0, -2147483648, 2147483647, 42, 666, -42, -666];
    int8 R = cast(int8) _mm256_setr_epi32(-1, 0, -2147483648, 2147483647, 42, 666, -42, -666);
    assert(R.array == expected);
}
239 
240 
/// Return vector of type `__m256i` with all elements set to zero.
__m256i _mm256_setzero_si256() pure @trusted
{
    // PERF: nothing was checked
    pragma(inline, true);
    
    version(LDC)
    {
        // Load from an all-zero array of eight 32-bit integers.
        int[8] result = [0, 0, 0, 0, 0, 0, 0, 0];
        return cast(__m256i)( loadUnaligned!(int8)(result.ptr) );
    }
    else
    {
        // Scalar assignment sets every element of the vector to zero.
        __m256i r;
        r = 0;
        return r;
    }
}
unittest
{
    // Viewed as eight 32-bit lanes, every lane must be zero.
    int8 Z = cast(int8) _mm256_setzero_si256();
    int[8] correct = [0, 0, 0, 0, 0, 0, 0, 0];
    assert(Z.array == correct);
}
259 
/// Store 256-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
///
/// Params:
///     mem_addr = destination; 32 bytes are written starting at this address.
///     a        = value to store.
pragma(inline, true)
void _mm256_storeu_si256 (__m256i* mem_addr, __m256i a) pure @trusted
{
    // PERF: DMD and GDC
    version(LDC)
    {
        storeUnaligned!__m256i(a, cast(long*)mem_addr);
    }
    else
    {
        // Generic path: write the four 64-bit lanes one at a time.
        long4 v = cast(long4)a;
        long* p = cast(long*)mem_addr;
        for(int n = 0; n < 4; ++n)
            p[n] = v.array[n];
    }
}

/// Legacy overload kept so that existing callers passing a pointer-to-const
/// still compile. The function writes to `mem_addr`, so the destination must
/// really be mutable: mutating data through a pointer after casting `const`
/// away is undefined behaviour in D. Pass a `__m256i*` instead.
deprecated("_mm256_storeu_si256 writes to mem_addr; pass a mutable __m256i* instead")
void _mm256_storeu_si256 (const(__m256i)* mem_addr, __m256i a) pure @trusted
{
    pragma(inline, true);
    _mm256_storeu_si256(cast(__m256i*) mem_addr, a);
}
unittest
{
    int[8] expected = [-1, 2, -3, 4, 9, -7, 8, -6];
    __m256i A = _mm256_setr_epi32(-1, 2, -3, 4, 9, -7, 8, -6);
    int[8] stored;
    _mm256_storeu_si256(cast(__m256i*) stored.ptr, A);
    assert(stored == expected);
}