/**
* AVX2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX2
*
* Copyright: Guillaume Piolat 2022.
*            Johan Engelen 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.avx2intrin;

// AVX2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX2
// Note: this header will work whether you have AVX2 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx2"] or equivalent to actively
// generate AVX2 instructions.
// With GDC, use "dflags-gdc": ["-mavx2"] or equivalent to actively
// generate AVX2 instructions.
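// For instance, a dub.json could carry these flags (illustrative sketch only,
// adapt to your own build setup):
//
//     "dflags-ldc": ["-mattr=+avx2"],
//     "dflags-gdc": ["-mavx2"]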

public import inteli.types;
import inteli.internals;

// Pull in all previous instruction set intrinsics.
public import inteli.avxintrin;

nothrow @nogc:

/// Add packed 32-bit integers in `a` and `b`.
__m256i _mm256_add_epi32(__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return cast(__m256i)(cast(int8)a + cast(int8)b);
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432);
    int8 R = cast(int8) _mm256_add_epi32(A, A);
    int[8] correct = [ -14, -2, 0, 18, -200, 200, 468, 864 ];
    assert(R.array == correct);
}

/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_and_si256 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return a & b;
}
unittest
{
    __m256i A = _mm256_set1_epi32(7);
    __m256i B = _mm256_set1_epi32(14);
    int8 R = cast(int8) _mm256_and_si256(A, B);
    int[8] correct = [6, 6, 6, 6, 6, 6, 6, 6];
    assert(R.array == correct);
}

/// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
__m256i _mm256_cvtepu16_epi32(__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxwd256(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int8 r; // PERF =void;
        // Explicit cast to unsigned to get *zero* extension (instead of sign extension).
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        r.ptr[4] = cast(ushort)sa.array[4];
        r.ptr[5] = cast(ushort)sa.array[5];
        r.ptr[6] = cast(ushort)sa.array[6];
        r.ptr[7] = cast(ushort)sa.array[7];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767);
    int8 C = cast(int8) _mm256_cvtepu16_epi32(A);
    int[8] correct = [65535, 0, 32768, 32767, 65535, 0, 32768, 32767];
    assert(C.array == correct);
}

/// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`.
__m128i _mm256_extracti128_si256(int imm8)(__m256i a) pure @trusted
    if ( (imm8 == 0) || (imm8 == 1) )
{
    pragma(inline, true);

    static if (GDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_extract128i256(a, imm8);
    }
    else version (LDC)
    {
        enum str = (imm8 == 1) ? "<i32 2, i32 3>" : "<i32 0, i32 1>";
        enum ir = "%r = shufflevector <4 x i64> %0, <4 x i64> undef, <2 x i32>" ~ str ~ "\n" ~
                  "ret <2 x i64> %r";
        return cast(__m128i) LDCInlineIR!(ir, ulong2, ulong4)(cast(ulong4)a);
    }
    else
    {
        long4 al = cast(long4) a;
        long2 ret;
        ret.ptr[0] = (imm8==1) ? al.array[2] : al.array[0];
        ret.ptr[1] = (imm8==1) ? al.array[3] : al.array[1];
        return cast(__m128i) ret;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432 );
    int[4] correct0 = [ -7, -1, 0, 9 ];
    int[4] correct1 = [ -100, 100, 234, 432 ];
    __m128i R0 = _mm256_extracti128_si256!(0)(A);
    __m128i R1 = _mm256_extracti128_si256!(1)(A);
    assert(R0.array == correct0);
    assert(R1.array == correct1);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
/// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
/// and pack the results in the destination.
__m256i _mm256_madd_epi16 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
    }
    else
    {
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        int8 r; // PERF =void;
        foreach(i; 0..8)
        {
            r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return cast(__m256i) r;
    }
}
unittest
{
    short16 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short16 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int8 R = cast(int8) _mm256_madd_epi16(cast(__m256i)A, cast(__m256i)B);
    int[8] correct = [1, 13, -2147483648, 2*32767*32767, 1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_or_si256 (__m256i a, __m256i b) pure @safe
{
    return a | b;
}
// TODO unittest and thus force inline
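// A minimal usage example, added here as a sketch (values chosen for
// illustration, not taken from the original source):
unittest
{
    __m256i A = _mm256_set1_epi32(5);  // 0b0101 in each lane
    __m256i B = _mm256_set1_epi32(12); // 0b1100 in each lane
    int8 R = cast(int8) _mm256_or_si256(A, B);
    int[8] correct = [13, 13, 13, 13, 13, 13, 13, 13]; // 0b1101
    assert(R.array == correct);
}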

/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each
/// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the
/// low 16 bits of 64-bit elements in the result.
__m256i _mm256_sad_epu8 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psadbw256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psadbw256(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // PERF: ARM64/32 is lacking
        byte32 ab = cast(byte32)a;
        byte32 bb = cast(byte32)b;
        ubyte[32] t;
        foreach(i; 0..32)
        {
            int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
            if (diff < 0) diff = -diff;
            t.ptr[i] = cast(ubyte)(diff);
        }
        int8 r = cast(int8) _mm256_setzero_si256();
        r.ptr[0] = t[0]  + t[1]  + t[2]  + t[3]  + t[4]  + t[5]  + t[6]  + t[7];
        r.ptr[2] = t[8]  + t[9]  + t[10] + t[11] + t[12] + t[13] + t[14] + t[15];
        r.ptr[4] = t[16] + t[17] + t[18] + t[19] + t[20] + t[21] + t[22] + t[23];
        r.ptr[6] = t[24] + t[25] + t[26] + t[27] + t[28] + t[29] + t[30] + t[31];
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54,
                                 3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m256i B = _mm256_set1_epi8(1);
    int8 R = cast(int8) _mm256_sad_epu8(A, B);
    int[8] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0,
                      2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m256i _mm256_slli_epi16(__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else
    {
        //PERF: ARM
        short16 sa  = cast(short16)a;
        short16 r   = cast(short16)_mm256_setzero_si256();
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m256i)r;
        foreach(i; 0..16)
            r.ptr[i] = cast(short)(sa.array[i] << count);
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
    short16 B = cast(short16)( _mm256_slli_epi16(A, 1) );
    short16 B2 = cast(short16)( _mm256_slli_epi16(A, 1 + 256) );
    short[16] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14, 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16)( _mm256_slli_epi16(A, 16) );
    short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m256i _mm256_slli_epi32 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsic guarantees that only imm8[0..7] is used, but in D
        //       it is illegal (UB) to shift by the same or more bits than the
        //       size of the quantity being shifted, hence the explicit check below.
        int8 a_int8 = cast(int8) a;
        int8 r      = cast(int8) _mm256_setzero_si256();

        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            return cast(__m256i) r;

        foreach(i; 0..8)
            r.ptr[i] = cast(uint)(a_int8.array[i]) << count;
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
    int8 B = cast(int8) _mm256_slli_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_slli_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 4, 6, -8, 0, 4, 6, -8 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_slli_epi32(A, 0);
    int[8] expectedC = [ 0, 2, 3, -4, 0, 2, 3, -4 ];
    assert(C.array == expectedC);

    int8 D = cast(int8) _mm256_slli_epi32(A, 65);
    int[8] expectedD = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(D.array == expectedD);
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m256i _mm256_srli_epi16 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else
    {
        //PERF: ARM
        short16 sa  = cast(short16)a;
        ubyte count = cast(ubyte)imm8;
        short16 r   = cast(short16) _mm256_setzero_si256();
        if (count >= 16)
            return cast(__m256i)r;

        foreach(i; 0..16)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
    short16 B = cast(short16) _mm256_srli_epi16(A, 1);
    short16 B2 = cast(short16) _mm256_srli_epi16(A, 1 + 256);
    short[16] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3, 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16) _mm256_srli_epi16(A, 16);
    short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);

    short16 D = cast(short16) _mm256_srli_epi16(A, 0);
    short[16] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7 ];
    assert(D.array == expectedD);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m256i _mm256_srli_epi32 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else
    {
        ubyte count = cast(ubyte) imm8;
        int8 a_int8 = cast(int8) a;

        // Note: the intrinsic guarantees that only imm8[0..7] is used, but in D
        //       it is illegal (UB) to shift by the same or more bits than the
        //       size of the quantity being shifted, hence the explicit check below.
        int8 r = cast(int8) _mm256_setzero_si256();
        if (count >= 32)
            return cast(__m256i) r;
        r.ptr[0] = a_int8.array[0] >>> count;
        r.ptr[1] = a_int8.array[1] >>> count;
        r.ptr[2] = a_int8.array[2] >>> count;
        r.ptr[3] = a_int8.array[3] >>> count;
        r.ptr[4] = a_int8.array[4] >>> count;
        r.ptr[5] = a_int8.array[5] >>> count;
        r.ptr[6] = a_int8.array[6] >>> count;
        r.ptr[7] = a_int8.array[7] >>> count;
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
    int8 B = cast(int8) _mm256_srli_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_srli_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 1, 1, 0x7FFFFFFE, 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_srli_epi32(A, 255);
    int[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}

/// Compute the bitwise XOR of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_xor_si256 (__m256i a, __m256i b) pure @safe
{
    return a ^ b;
}
// TODO unittest and thus force inline
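// A minimal usage example, added here as a sketch (values chosen for
// illustration, not taken from the original source):
unittest
{
    __m256i A = _mm256_set1_epi32(5);  // 0b0101 in each lane
    __m256i B = _mm256_set1_epi32(12); // 0b1100 in each lane
    int8 R = cast(int8) _mm256_xor_si256(A, B);
    int[8] correct = [9, 9, 9, 9, 9, 9, 9, 9]; // 0b1001
    assert(R.array == correct);
}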