/**
* AVX2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX2
*
* Copyright: Guillaume Piolat 2022.
*            Johan Engelen 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.avx2intrin;

// AVX2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX2
// Note: this header will work whether you have AVX2 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx2"] or equivalent to actively
// generate AVX2 instructions.
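//
// For example, a dub.json fragment enabling this could look as follows (a sketch
// only; the package name and version constraint are illustrative):
//
//     "dependencies": { "intel-intrinsics": "~>1.0" },
//     "dflags-ldc": ["-mattr=+avx2"]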

public import inteli.types;
import inteli.internals;

// Pull in all previous instruction set intrinsics.
public import inteli.avxintrin;

nothrow @nogc:

/// Add packed 32-bit integers in `a` and `b`.
__m256i _mm256_add_epi32(__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return cast(__m256i)(cast(int8)a + cast(int8)b);
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432);
    int8 R = cast(int8) _mm256_add_epi32(A, A);
    int[8] correct = [ -14, -2, 0, 18, -200, 200, 468, 864 ];
    assert(R.array == correct);
}

/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_and_si256 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return a & b;
}
unittest
{
    __m256i A = _mm256_set1_epi32(7);
    __m256i B = _mm256_set1_epi32(14);
    int8 R = cast(int8) _mm256_and_si256(A, B);
    int[8] correct = [6, 6, 6, 6, 6, 6, 6, 6];
    assert(R.array == correct);
}

/// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
__m256i _mm256_cvtepu16_epi32(__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxwd256(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int8 r;
        // Explicit cast to unsigned to get *zero* extension (instead of sign extension).
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        r.ptr[4] = cast(ushort)sa.array[4];
        r.ptr[5] = cast(ushort)sa.array[5];
        r.ptr[6] = cast(ushort)sa.array[6];
        r.ptr[7] = cast(ushort)sa.array[7];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767);
    int8 C = cast(int8) _mm256_cvtepu16_epi32(A);
    int[8] correct = [65535, 0, 32768, 32767, 65535, 0, 32768, 32767];
    assert(C.array == correct);
}

/// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`.
__m128i _mm256_extracti128_si256(int imm8)(__m256i a) pure @trusted
    if ( (imm8 == 0) || (imm8 == 1) )
{
    pragma(inline, true);

    static if (GDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_extract128i256(a, imm8);
    }
    else version (LDC)
    {
        enum str = (imm8 == 1) ? "<i32 2, i32 3>" : "<i32 0, i32 1>";
        enum ir = "%r = shufflevector <4 x i64> %0, <4 x i64> undef, <2 x i32>" ~ str ~ "\n" ~
                  "ret <2 x i64> %r";
        return cast(__m128i) LDCInlineIR!(ir, ulong2, ulong4)(cast(ulong4)a);
    }
    else
    {
        long4 al = cast(long4) a;
        long2 ret;
        ret.ptr[0] = (imm8==1) ? al.array[2] : al.array[0];
        ret.ptr[1] = (imm8==1) ? al.array[3] : al.array[1];
        return cast(__m128i) ret;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432 );
    int[4] correct0 = [ -7, -1, 0, 9 ];
    int[4] correct1 = [ -100, 100, 234, 432 ];
    __m128i R0 = _mm256_extracti128_si256!(0)(A);
    __m128i R1 = _mm256_extracti128_si256!(1)(A);
    assert(R0.array == correct0);
    assert(R1.array == correct1);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
/// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
/// and pack the results in the destination.
__m256i _mm256_madd_epi16 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
    }
    else
    {
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        int8 r;
        foreach(i; 0..8)
        {
            // Note: the only possible overflow, (-32768)*(-32768) + (-32768)*(-32768),
            // wraps to int.min, which matches the hardware PMADDWD behaviour.
            r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return cast(__m256i) r;
    }
}
unittest
{
    short16 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short16 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int8 R = cast(int8) _mm256_madd_epi16(cast(__m256i)A, cast(__m256i)B);
    int[8] correct = [1, 13, -2147483648, 2*32767*32767, 1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_or_si256 (__m256i a, __m256i b) pure @safe
{
    return a | b;
}
// TODO unittest and thus force inline
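// In the meantime, a minimal usage sketch for the TODO above (an illustration only,
// not the library's official unittest); it reuses _mm256_set1_epi32 from inteli.avxintrin.
unittest
{
    __m256i A = _mm256_set1_epi32(7);
    __m256i B = _mm256_set1_epi32(14);
    int8 R = cast(int8) _mm256_or_si256(A, B);
    int[8] correct = [15, 15, 15, 15, 15, 15, 15, 15]; // 7 | 14 == 15
    assert(R.array == correct);
}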

/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each
/// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the
/// low 16 bits of 64-bit elements in result.
__m256i _mm256_sad_epu8 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psadbw256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psadbw256(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // PERF: ARM64/32 is lacking
        byte32 ab = cast(byte32)a;
        byte32 bb = cast(byte32)b;
        ubyte[32] t;
        foreach(i; 0..32)
        {
            int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
            if (diff < 0) diff = -diff;
            t.ptr[i] = cast(ubyte)(diff);
        }
        int8 r = cast(int8) _mm256_setzero_si256();
        r.ptr[0] = t[0]  + t[1]  + t[2]  + t[3]  + t[4]  + t[5]  + t[6]  + t[7];
        r.ptr[2] = t[8]  + t[9]  + t[10] + t[11] + t[12] + t[13] + t[14] + t[15];
        r.ptr[4] = t[16] + t[17] + t[18] + t[19] + t[20] + t[21] + t[22] + t[23];
        r.ptr[6] = t[24] + t[25] + t[26] + t[27] + t[28] + t[29] + t[30] + t[31];
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54,
                                 3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m256i B = _mm256_set1_epi8(1);
    int8 R = cast(int8) _mm256_sad_epu8(A, B);
    int[8] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0,
                      2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m256i _mm256_slli_epi16(__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else
    {
        // PERF: ARM
        short16 sa  = cast(short16)a;
        short16 r   = cast(short16)_mm256_setzero_si256();
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m256i)r;
        foreach(i; 0..16)
            r.ptr[i] = cast(short)(sa.array[i] << count);
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
    short16 B = cast(short16)( _mm256_slli_epi16(A, 1) );
    short16 B2 = cast(short16)( _mm256_slli_epi16(A, 1 + 256) );
    short[16] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14, 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16)( _mm256_slli_epi16(A, 16) );
    short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m256i _mm256_slli_epi32 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsic is specified to use imm8[0..7], but D says
        //       "It's illegal to shift by the same or more bits than the size
        //       of the quantity being shifted", which is undefined behaviour.
        //       So guard against shift counts of 32 or more explicitly.
        int8 a_int8 = cast(int8) a;
        int8 r      = cast(int8) _mm256_setzero_si256();

        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            return cast(__m256i) r;

        foreach(i; 0..8)
            r.ptr[i] = cast(uint)(a_int8.array[i]) << count;
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
    int8 B = cast(int8) _mm256_slli_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_slli_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 4, 6, -8, 0, 4, 6, -8 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_slli_epi32(A, 0);
    int[8] expectedC = [ 0, 2, 3, -4, 0, 2, 3, -4 ];
    assert(C.array == expectedC);

    int8 D = cast(int8) _mm256_slli_epi32(A, 65);
    int[8] expectedD = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(D.array == expectedD);
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m256i _mm256_srli_epi16 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else
    {
        // PERF: ARM
        short16 sa  = cast(short16)a;
        ubyte count = cast(ubyte)imm8;
        short16 r   = cast(short16) _mm256_setzero_si256();
        if (count >= 16)
            return cast(__m256i)r;

        foreach(i; 0..16)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
    short16 B = cast(short16) _mm256_srli_epi16(A, 1);
    short16 B2 = cast(short16) _mm256_srli_epi16(A, 1 + 256);
    short[16] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3, 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16) _mm256_srli_epi16(A, 16);
    short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);

    short16 D = cast(short16) _mm256_srli_epi16(A, 0);
    short[16] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7 ];
    assert(D.array == expectedD);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m256i _mm256_srli_epi32 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else
    {
        ubyte count = cast(ubyte) imm8;
        int8 a_int8 = cast(int8) a;

        // Note: the intrinsic is specified to use imm8[0..7], but D says
        //       "It's illegal to shift by the same or more bits than the size
        //       of the quantity being shifted", which is undefined behaviour.
        //       So guard against shift counts of 32 or more explicitly.
        int8 r = cast(int8) _mm256_setzero_si256();
        if (count >= 32)
            return cast(__m256i) r;
        r.ptr[0] = a_int8.array[0] >>> count;
        r.ptr[1] = a_int8.array[1] >>> count;
        r.ptr[2] = a_int8.array[2] >>> count;
        r.ptr[3] = a_int8.array[3] >>> count;
        r.ptr[4] = a_int8.array[4] >>> count;
        r.ptr[5] = a_int8.array[5] >>> count;
        r.ptr[6] = a_int8.array[6] >>> count;
        r.ptr[7] = a_int8.array[7] >>> count;
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
    int8 B = cast(int8) _mm256_srli_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_srli_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 1, 1, 0x7FFFFFFE, 0, 1, 1, 0x7FFFFFFE ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_srli_epi32(A, 255);
    int[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}

/// Compute the bitwise XOR of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_xor_si256 (__m256i a, __m256i b) pure @safe
{
    return a ^ b;
}
// TODO unittest and thus force inline
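// In the meantime, a minimal usage sketch for the TODO above (an illustration only,
// not the library's official unittest); it reuses _mm256_set1_epi32 from inteli.avxintrin.
unittest
{
    __m256i A = _mm256_set1_epi32(7);
    __m256i B = _mm256_set1_epi32(14);
    int8 R = cast(int8) _mm256_xor_si256(A, B);
    int[8] correct = [9, 9, 9, 9, 9, 9, 9, 9]; // 7 ^ 14 == 9
    assert(R.array == correct);
}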