/**
* AVX2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX2
*
* Copyright: Guillaume Piolat 2022.
*            Johan Engelen 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.avx2intrin;

// AVX2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX2
// Note: this header will work whether you have AVX2 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx2"] or equivalent to actively
// generate AVX2 instructions.
// With GDC, use "dflags-gdc": ["-mavx2"] or equivalent to actively
// generate AVX2 instructions.

public import inteli.types;
import inteli.internals;

// Pull in all previous instruction set intrinsics.
public import inteli.avxintrin;

nothrow @nogc:

/// Add packed 32-bit integers in `a` and `b`.
__m256i _mm256_add_epi32(__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return cast(__m256i)(cast(int8)a + cast(int8)b);
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432);
    int8 R = cast(int8) _mm256_add_epi32(A, A);
    int[8] correct = [ -14, -2, 0, 18, -200, 200, 468, 864 ];
    assert(R.array == correct);
}

/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_and_si256 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return a & b;
}
unittest
{
    __m256i A = _mm256_set1_epi32(7);
    __m256i B = _mm256_set1_epi32(14);
    int8 R = cast(int8) _mm256_and_si256(A, B);
    int[8] correct = [6, 6, 6, 6, 6, 6, 6, 6];
    assert(R.array == correct);
}

/// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
__m256i _mm256_cvtepu16_epi32(__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxwd256(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int8 r; // PERF =void;
        // Explicit cast to unsigned to get *zero* extension (instead of sign extension).
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        r.ptr[4] = cast(ushort)sa.array[4];
        r.ptr[5] = cast(ushort)sa.array[5];
        r.ptr[6] = cast(ushort)sa.array[6];
        r.ptr[7] = cast(ushort)sa.array[7];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767);
    int8 C = cast(int8) _mm256_cvtepu16_epi32(A);
    int[8] correct = [65535, 0, 32768, 32767, 65535, 0, 32768, 32767];
    assert(C.array == correct);
}

/// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`.
__m128i _mm256_extracti128_si256(int imm8)(__m256i a) pure @trusted
    if ( (imm8 == 0) || (imm8 == 1) )
{
    pragma(inline, true);

    static if (GDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_extract128i256(a, imm8);
    }
    else version (LDC)
    {
        enum str = (imm8 == 1) ? "<i32 2, i32 3>" : "<i32 0, i32 1>";
        enum ir = "%r = shufflevector <4 x i64> %0, <4 x i64> undef, <2 x i32>" ~ str ~ "\n" ~
                  "ret <2 x i64> %r";
        return cast(__m128i) LDCInlineIR!(ir, ulong2, ulong4)(cast(ulong4)a);
    }
    else
    {
        long4 al = cast(long4) a;
        long2 ret;
        ret.ptr[0] = (imm8==1) ? al.array[2] : al.array[0];
        ret.ptr[1] = (imm8==1) ? al.array[3] : al.array[1];
        return cast(__m128i) ret;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432 );
    int[4] correct0 = [ -7, -1, 0, 9 ];
    int[4] correct1 = [ -100, 100, 234, 432 ];
    __m128i R0 = _mm256_extracti128_si256!(0)(A);
    __m128i R1 = _mm256_extracti128_si256!(1)(A);
    assert(R0.array == correct0);
    assert(R1.array == correct1);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
/// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
/// and pack the results in destination.
__m256i _mm256_madd_epi16 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
    }
    else
    {
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        int8 r; // PERF =void;
        foreach(i; 0..8)
        {
            r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return cast(__m256i) r;
    }
}
unittest
{
    short16 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short16 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int8 R = cast(int8) _mm256_madd_epi16(cast(__m256i)A, cast(__m256i)B);
    int[8] correct = [1, 13, -2147483648, 2*32767*32767, 1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_or_si256 (__m256i a, __m256i b) pure @safe
{
    return a | b;
}
// TODO unittest and thus force inline
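// A minimal unit-test sketch for _mm256_or_si256; operand values are arbitrary
// (7 | 14 == 15 in every 32-bit lane).
unittest
{
    __m256i A = _mm256_set1_epi32(7);
    __m256i B = _mm256_set1_epi32(14);
    int8 R = cast(int8) _mm256_or_si256(A, B);
    int[8] correct = [15, 15, 15, 15, 15, 15, 15, 15];
    assert(R.array == correct);
}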

/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each
/// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the
/// low 16 bits of 64-bit elements in result.
__m256i _mm256_sad_epu8 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psadbw256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psadbw256(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // PERF: ARM64/32 is lacking
        byte32 ab = cast(byte32)a;
        byte32 bb = cast(byte32)b;
        ubyte[32] t;
        foreach(i; 0..32)
        {
            int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
            if (diff < 0) diff = -diff;
            t.ptr[i] = cast(ubyte)(diff);
        }
        int8 r = cast(int8) _mm256_setzero_si256();
        r.ptr[0] = t[0]  + t[1]  + t[2]  + t[3]  + t[4]  + t[5]  + t[6]  + t[7];
        r.ptr[2] = t[8]  + t[9]  + t[10] + t[11] + t[12] + t[13] + t[14] + t[15];
        r.ptr[4] = t[16] + t[17] + t[18] + t[19] + t[20] + t[21] + t[22] + t[23];
        r.ptr[6] = t[24] + t[25] + t[26] + t[27] + t[28] + t[29] + t[30] + t[31];
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54,
                                 3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m256i B = _mm256_set1_epi8(1);
    int8 R = cast(int8) _mm256_sad_epu8(A, B);
    int[8] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0,
                      2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m256i _mm256_slli_epi16(__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else
    {
        // PERF: ARM
        short16 sa = cast(short16)a;
        short16 r = cast(short16)_mm256_setzero_si256();
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m256i)r;
        foreach(i; 0..16)
            r.ptr[i] = cast(short)(sa.array[i] << count);
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
    short16 B = cast(short16)( _mm256_slli_epi16(A, 1) );
    short16 B2 = cast(short16)( _mm256_slli_epi16(A, 1 + 256) );
    short[16] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14, 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16)( _mm256_slli_epi16(A, 16) );
    short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m256i _mm256_slli_epi32 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        // D says "It's illegal to shift by the same or more bits
        // than the size of the quantity being shifted"
        // and it's UB instead.
        int8 a_int8 = cast(int8) a;
        int8 r = cast(int8) _mm256_setzero_si256();

        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            return cast(__m256i) r;

        foreach(i; 0..8)
            r.ptr[i] = cast(uint)(a_int8.array[i]) << count;
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
    int8 B = cast(int8) _mm256_slli_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_slli_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 4, 6, -8, 0, 4, 6, -8 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_slli_epi32(A, 0);
    int[8] expectedC = [ 0, 2, 3, -4, 0, 2, 3, -4 ];
    assert(C.array == expectedC);

    int8 D = cast(int8) _mm256_slli_epi32(A, 65);
    int[8] expectedD = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(D.array == expectedD);
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m256i _mm256_srli_epi16 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else
    {
        // PERF: ARM
        short16 sa = cast(short16)a;
        ubyte count = cast(ubyte)imm8;
        short16 r = cast(short16) _mm256_setzero_si256();
        if (count >= 16)
            return cast(__m256i)r;

        foreach(i; 0..16)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
    short16 B = cast(short16) _mm256_srli_epi16(A, 1);
    short16 B2 = cast(short16) _mm256_srli_epi16(A, 1 + 256);
    short[16] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3, 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16) _mm256_srli_epi16(A, 16);
    short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);

    short16 D = cast(short16) _mm256_srli_epi16(A, 0);
    short[16] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7 ];
    assert(D.array == expectedD);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m256i _mm256_srli_epi32 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else
    {
        ubyte count = cast(ubyte) imm8;
        int8 a_int8 = cast(int8) a;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        // D says "It's illegal to shift by the same or more bits
        // than the size of the quantity being shifted"
        // and it's UB instead.
        int8 r = cast(int8) _mm256_setzero_si256();
        if (count >= 32)
            return cast(__m256i) r;
        r.ptr[0] = a_int8.array[0] >>> count;
        r.ptr[1] = a_int8.array[1] >>> count;
        r.ptr[2] = a_int8.array[2] >>> count;
        r.ptr[3] = a_int8.array[3] >>> count;
        r.ptr[4] = a_int8.array[4] >>> count;
        r.ptr[5] = a_int8.array[5] >>> count;
        r.ptr[6] = a_int8.array[6] >>> count;
        r.ptr[7] = a_int8.array[7] >>> count;
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
    int8 B = cast(int8) _mm256_srli_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_srli_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 1, 1, 0x7FFFFFFE, 0, 1, 1, 0x7FFFFFFE ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_srli_epi32(A, 255);
    int[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}

/// Compute the bitwise XOR of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_xor_si256 (__m256i a, __m256i b) pure @safe
{
    return a ^ b;
}
// TODO unittest and thus force inline
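// A minimal unit-test sketch for _mm256_xor_si256; operand values are arbitrary
// (7 ^ 14 == 9 in every 32-bit lane).
unittest
{
    __m256i A = _mm256_set1_epi32(7);
    __m256i B = _mm256_set1_epi32(14);
    int8 R = cast(int8) _mm256_xor_si256(A, B);
    int[8] correct = [9, 9, 9, 9, 9, 9, 9, 9];
    assert(R.array == correct);
}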