1 /**
2 * AVX intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
4 *
5 * Copyright: Guillaume Piolat 2022.
6 *            Johan Engelen 2022.
7 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8 */
9 module inteli.avxintrin;
10 
11 // AVX instructions
12 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX
13 // Note: this header will work whether you have AVX enabled or not.
14 // With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
15 // generate AVX instructions.
16 
17 public import inteli.types;
18 import inteli.internals;
19 
20 // Pull in all previous instruction set intrinsics.
21 public import inteli.tmmintrin;
22 
23 nothrow @nogc:
24 
/// Return the 32-bit integer lane of `a` designated by `imm8`.
/// Only the three lowest bits of `imm8` take part in the selection.
int _mm256_extract_epi32 (__m256i a, const int imm8) pure @trusted
{
    // View the 256-bit value as eight ints; the mask keeps the index within 0 .. 7.
    int8 lanes = cast(int8) a;
    return lanes.array[imm8 & 7];
}
unittest
{
    align(16) int[8] values = [-1, 2, -3, 4, 9, -7, 8, -6];
    __m256i V = _mm256_loadu_si256(cast(__m256i*) values.ptr);
    // Selector bits above the low three are ignored.
    assert(_mm256_extract_epi32(V, 0) == -1);
    assert(_mm256_extract_epi32(V, 1 + 8) == 2);
    assert(_mm256_extract_epi32(V, 3 + 16) == 4);
    assert(_mm256_extract_epi32(V, 7 + 32) == -6);
}
39 
/// Read 256 bits of integer data from `mem_addr`. No particular alignment is required of `mem_addr`.
__m256i _mm256_loadu_si256 (const(__m256i)* mem_addr) pure @trusted
{
    // PERF DMD
    pragma(inline, true);
    static if (GDC_with_AVX)
    {
        const(char)* bytes = cast(const(char)*) mem_addr;
        return cast(__m256i) __builtin_ia32_loaddqu256(bytes);
    }
    else version(LDC)
    {
        long* words = cast(long*) mem_addr;
        return loadUnaligned!(__m256i)(words);
    }
    else
    {
        // Generic path: copy the four 64-bit words one at a time.
        const(long)* words = cast(const(long)*) mem_addr;
        long4 loaded;
        foreach (i; 0 .. 4)
            loaded.ptr[i] = words[i];
        return loaded;
    }
}
unittest
{
    align(16) int[8] expected = [-1, 2, -3, 4, 9, -7, 8, -6];
    __m256i raw = _mm256_loadu_si256(cast(__m256i*) expected.ptr);
    int8 loaded = cast(int8) raw;
    assert(loaded.array == expected);
}
70 
/// Fill every 8-bit element of the returned vector with `a`.
__m256i _mm256_set1_epi8 (byte a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        // Initialize a named vector from the scalar instead of using a constructor call.
        byte32 splat = a;
        return cast(__m256i) splat;
    }
    else
    {
        pragma(inline, true);
        byte32 splat = byte32(a);
        return cast(__m256i) splat;
    }
}
unittest
{
    byte32 filled = cast(byte32) _mm256_set1_epi8(31);
    foreach (lane; filled.array)
        assert(lane == 31);
}
91 
/// Fill every 16-bit element of the returned vector with `a`.
__m256i _mm256_set1_epi16 (short a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        // Initialize a named vector from the scalar instead of using a constructor call.
        short16 splat = a;
        return cast(__m256i) splat;
    }
    else
    {
        pragma(inline, true);
        short16 splat = short16(a);
        return cast(__m256i) splat;
    }
}
unittest
{
    short16 filled = cast(short16) _mm256_set1_epi16(31);
    foreach (lane; filled.array)
        assert(lane == 31);
}
112 
/// Fill every 32-bit element of the returned vector with `a`.
__m256i _mm256_set1_epi32 (int a) pure @trusted
{
    pragma(inline, true);
    int8 splat = int8(a);
    return cast(__m256i) splat;
}
unittest
{
    int8 filled = cast(int8) _mm256_set1_epi32(31);
    foreach (lane; filled.array)
        assert(lane == 31);
}
125 
/// Build a vector of 32 packed 8-bit integers from the arguments, in reverse order:
/// the first argument (`e31`) goes to the lowest element and the last (`e0`) to the highest.
__m256i _mm256_setr_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
                          byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
                          byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9,  byte e8,
                          byte e7,  byte e6,  byte e5,  byte e4,  byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    // PERF GDC, not checked
    pragma(inline, true);
    // Arguments laid out in memory in call order.
    byte[32] lanes = [ e31,  e30,  e29,  e28,  e27,  e26,  e25,  e24,
                       e23,  e22,  e21,  e20,  e19,  e18,  e17,  e16,
                       e15,  e14,  e13,  e12,  e11,  e10,  e9,   e8,
                       e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
    static if (GDC_with_AVX)
    {
        const(char)* bytes = cast(const(char)*) lanes.ptr;
        return cast(__m256i) __builtin_ia32_loaddqu256(bytes);
    }
    else version(LDC)
    {
        byte32 vec = loadUnaligned!(byte32)(lanes.ptr);
        return cast(__m256i) vec;
    }
    else
    {
        // Generic path: copy element by element.
        byte32 vec;
        foreach (i, lane; lanes)
            vec.ptr[i] = lane;
        return cast(__m256i) vec;
    }
}
unittest
{
    byte[32] expected = [-1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128];
    __m256i raw = _mm256_setr_epi8( -1, 0, -21, 21, 42, 127, -42, -128,
                                    -1, 0, -21, 21, 42, 127, -42, -128,
                                    -1, 0, -21, 21, 42, 127, -42, -128,
                                    -1, 0, -21, 21, 42, 127, -42, -128);
    byte32 built = cast(byte32) raw;
    assert(built.array == expected);
}
166 
/// Build a vector of 16 packed 16-bit integers from the arguments, in reverse order:
/// the first argument (`e15`) goes to the lowest element and the last (`e0`) to the highest.
__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9,  short e8,
                           short e7,  short e6,  short e5,  short e4,  short e3,  short e2,  short e1,  short e0) pure @trusted
{
    pragma(inline, true);
    // Arguments laid out in memory in call order.
    short[16] lanes = [ e15,  e14,  e13,  e12,  e11,  e10,  e9,   e8,
                        e7,   e6,   e5,   e4,   e3,   e2,   e1,   e0];
    static if (GDC_with_AVX)
    {
        const(char)* bytes = cast(const(char)*) lanes.ptr;
        return cast(__m256i) __builtin_ia32_loaddqu256(bytes);
    }
    else version(LDC)
    {
        short16 vec = loadUnaligned!(short16)(lanes.ptr);
        return cast(__m256i) vec;
    }
    else
    {
        // Generic path: copy element by element.
        short16 vec;
        foreach (i, lane; lanes)
            vec.ptr[i] = lane;
        return cast(__m256i) vec;
    }
}
unittest
{
    short[16] expected = [-1, 0, -21, 21, 42, 127, -42, -128,
                          -1, 0, -21, 21, 42, 127, -42, -128];
    __m256i raw = _mm256_setr_epi16(-1, 0, -21, 21, 42, 127, -42, -128,
                                    -1, 0, -21, 21, 42, 127, -42, -128);
    short16 built = cast(short16) raw;
    assert(built.array == expected);
}
198 
/// Build a vector of 8 packed 32-bit integers from the arguments, in reverse order:
/// the first argument (`e7`) goes to the lowest element and the last (`e0`) to the highest.
__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
{
    pragma(inline, true);
    // Arguments laid out in memory in call order.
    int[8] lanes = [e7, e6, e5, e4, e3, e2, e1, e0];
    static if (GDC_with_AVX)
    {
        const(char)* bytes = cast(const(char)*) lanes.ptr;
        return cast(__m256i) __builtin_ia32_loaddqu256(bytes);
    }
    else version(LDC)
    {
        int8 vec = loadUnaligned!(int8)(lanes.ptr);
        return cast(__m256i) vec;
    }
    else
    {
        // Generic path: copy element by element.
        int8 vec;
        foreach (i, lane; lanes)
            vec.ptr[i] = lane;
        return cast(__m256i) vec;
    }
}
unittest
{
    int[8] expected = [-1, 0, int.min, int.max, 42, 666, -42, -666];
    __m256i raw = _mm256_setr_epi32(-1, 0, int.min, int.max, 42, 666, -42, -666);
    int8 built = cast(int8) raw;
    assert(built.array == expected);
}
226 
227 
/// Return vector of type `__m256i` with all elements set to zero.
__m256i _mm256_setzero_si256() pure @trusted
{
    // PERF: nothing was checked
    pragma(inline, true);

    version(LDC)
    {
        // Load the zero vector from an all-zero array.
        int[8] zeros = [0, 0, 0, 0, 0, 0, 0, 0];
        return cast(__m256i)( loadUnaligned!(int8)(zeros.ptr) );
    }
    else
    {
        __m256i r;
        r = 0;
        return r;
    }
}
unittest
{
    // Every 32-bit lane of the result must be zero.
    int8 Z = cast(int8) _mm256_setzero_si256();
    int[8] correct = [0, 0, 0, 0, 0, 0, 0, 0];
    assert(Z.array == correct);
}
246 
/// Store 256-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
///
/// Params:
///     mem_addr = destination; 32 bytes starting at this address are overwritten.
///     a        = value to store.
pragma(inline, true)
void _mm256_storeu_si256 (__m256i* mem_addr, __m256i a) pure @trusted
{
    // PERF: DMD and GDC
    version(LDC)
    {
        storeUnaligned!__m256i(a, cast(long*)mem_addr);
    }
    else
    {
        // Generic path: write the four 64-bit lanes one at a time.
        long4 v = cast(long4)a;
        long* p = cast(long*)mem_addr;
        for(int n = 0; n < 4; ++n)
            p[n] = v.array[n];
    }
}

/// Compatibility overload kept for callers that still pass a pointer-to-const.
/// The destination is written to, so it must refer to mutable memory; prefer
/// the `__m256i*` overload.
deprecated("_mm256_storeu_si256 writes to mem_addr; pass a mutable __m256i* instead of a const pointer")
void _mm256_storeu_si256 (const(__m256i)* mem_addr, __m256i a) pure @trusted
{
    pragma(inline, true);
    _mm256_storeu_si256(cast(__m256i*) mem_addr, a);
}
unittest
{
    align(16) int[8] correct = [-1, 2, -3, 4, 9, -7, 8, -6];
    __m256i A = _mm256_loadu_si256(cast(__m256i*) correct.ptr);

    // Store at a 4-byte offset from a 16-byte aligned buffer so the destination
    // is unaligned, with one guard element on each side to catch stray writes.
    align(16) int[10] buf = 0;
    _mm256_storeu_si256(cast(__m256i*) &buf[1], A);
    assert(buf[0] == 0);
    assert(buf[1 .. 9] == correct[]);
    assert(buf[9] == 0);
}