/**
* AVX2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX2
*
* Copyright: Guillaume Piolat 2022.
*            Johan Engelen 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.avx2intrin;

// AVX2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX2
// Note: this header will work whether you have AVX2 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx2"] or equivalent to actively
// generate AVX2 instructions.
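//
// For example, a dub.json build-settings fragment could look like the following
// (illustrative sketch only; the dependency name and version shown here are
// placeholders to adapt to your own project):
//
//     "dependencies": { "intel-intrinsics": "~>1.0" },
//     "dflags-ldc": ["-mattr=+avx2"]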

public import inteli.types;
import inteli.internals;

// Pull in all previous instruction set intrinsics.
public import inteli.avxintrin;

nothrow @nogc:

/// Add packed 32-bit integers in `a` and `b`.
__m256i _mm256_add_epi32(__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return cast(__m256i)(cast(int8)a + cast(int8)b);
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432);
    int8 R = cast(int8) _mm256_add_epi32(A, A);
    int[8] correct = [ -14, -2, 0, 18, -200, 200, 468, 864 ];
    assert(R.array == correct);
}

/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_and_si256 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return a & b;
}
unittest
{
    __m256i A = _mm256_set1_epi32(7);
    __m256i B = _mm256_set1_epi32(14);
    int8 R = cast(int8) _mm256_and_si256(A, B);
    int[8] correct = [6, 6, 6, 6, 6, 6, 6, 6];
    assert(R.array == correct);
}

/// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
__m256i _mm256_cvtepu16_epi32(__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxwd256(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int8 r;
        // Explicit cast to unsigned to get *zero* extension (instead of sign extension).
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        r.ptr[4] = cast(ushort)sa.array[4];
        r.ptr[5] = cast(ushort)sa.array[5];
        r.ptr[6] = cast(ushort)sa.array[6];
        r.ptr[7] = cast(ushort)sa.array[7];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767);
    int8 C = cast(int8) _mm256_cvtepu16_epi32(A);
    int[8] correct = [65535, 0, 32768, 32767, 65535, 0, 32768, 32767];
    assert(C.array == correct);
}

/// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`.
__m128i _mm256_extracti128_si256(int imm8)(__m256i a) pure @trusted
    if ( (imm8 == 0) || (imm8 == 1) )
{
    pragma(inline, true);

    static if (GDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_extract128i256(a, imm8);
    }
    else version (LDC)
    {
        enum str = (imm8 == 1) ? "<i32 2, i32 3>" : "<i32 0, i32 1>";
        enum ir = "%r = shufflevector <4 x i64> %0, <4 x i64> undef, <2 x i32>" ~ str ~ "\n" ~
                  "ret <2 x i64> %r";
        return cast(__m128i) LDCInlineIR!(ir, ulong2, ulong4)(cast(ulong4)a);
    }
    else
    {
        long4 al = cast(long4) a;
        long2 ret;
        ret.ptr[0] = (imm8==1) ? al.array[2] : al.array[0];
        ret.ptr[1] = (imm8==1) ? al.array[3] : al.array[1];
        return cast(__m128i) ret;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432 );
    int[4] correct0 = [ -7, -1, 0, 9 ];
    int[4] correct1 = [ -100, 100, 234, 432 ];
    __m128i R0 = _mm256_extracti128_si256!(0)(A);
    __m128i R1 = _mm256_extracti128_si256!(1)(A);
    assert(R0.array == correct0);
    assert(R1.array == correct1);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
/// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
/// and pack the results in destination.
__m256i _mm256_madd_epi16 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
    }
    else
    {
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        int8 r;
        foreach(i; 0..8)
        {
            r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return cast(__m256i) r;
    }
}
unittest
{
    short16 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short16 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int8 R = cast(int8) _mm256_madd_epi16(cast(__m256i)A, cast(__m256i)B);
    int[8] correct = [1, 13, -2147483648, 2*32767*32767, 1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Compute the bitwise OR of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_or_si256 (__m256i a, __m256i b) pure @safe
{
    return a | b;
}
// TODO unittest and thus force inline
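// Illustrative unittest sketch for _mm256_or_si256, mirroring the style of the
// other tests in this module (the TODO above notes that a real test is still missing):
unittest
{
    __m256i A = _mm256_set1_epi32(7);
    __m256i B = _mm256_set1_epi32(14);
    int8 R = cast(int8) _mm256_or_si256(A, B);
    int[8] correct = [15, 15, 15, 15, 15, 15, 15, 15]; // 7 | 14 == 15 in every lane
    assert(R.array == correct);
}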
/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each
/// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the
/// low 16 bits of 64-bit elements in result.
__m256i _mm256_sad_epu8 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psadbw256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psadbw256(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // PERF: ARM64/32 is lacking
        byte32 ab = cast(byte32)a;
        byte32 bb = cast(byte32)b;
        ubyte[32] t;
        foreach(i; 0..32)
        {
            int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
            if (diff < 0) diff = -diff;
            t.ptr[i] = cast(ubyte)(diff);
        }
        int8 r = cast(int8) _mm256_setzero_si256();
        r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
        r.ptr[2] = t[8] + t[9] + t[10] + t[11] + t[12] + t[13] + t[14] + t[15];
        r.ptr[4] = t[16] + t[17] + t[18] + t[19] + t[20] + t[21] + t[22] + t[23];
        r.ptr[6] = t[24] + t[25] + t[26] + t[27] + t[28] + t[29] + t[30] + t[31];
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54,
                                 3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m256i B = _mm256_set1_epi8(1);
    int8 R = cast(int8) _mm256_sad_epu8(A, B);
    int[8] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0,
                      2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m256i _mm256_slli_epi16(__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else
    {
        //PERF: ARM
        short16 sa = cast(short16)a;
        short16 r = cast(short16)_mm256_setzero_si256();
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m256i)r;
        foreach(i; 0..16)
            r.ptr[i] = cast(short)(sa.array[i] << count);
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
    short16 B = cast(short16)( _mm256_slli_epi16(A, 1) );
    short16 B2 = cast(short16)( _mm256_slli_epi16(A, 1 + 256) );
    short[16] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14, 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16)( _mm256_slli_epi16(A, 16) );
    short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m256i _mm256_slli_epi32 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        // D says "It's illegal to shift by the same or more bits
        // than the size of the quantity being shifted"
        // and it's UB instead.
        int8 a_int8 = cast(int8) a;
        int8 r = cast(int8) _mm256_setzero_si256();

        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            return cast(__m256i) r;

        foreach(i; 0..8)
            r.ptr[i] = cast(uint)(a_int8.array[i]) << count;
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
    int8 B = cast(int8) _mm256_slli_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_slli_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 4, 6, -8, 0, 4, 6, -8 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_slli_epi32(A, 0);
    int[8] expectedC = [ 0, 2, 3, -4, 0, 2, 3, -4 ];
    assert(C.array == expectedC);

    int8 D = cast(int8) _mm256_slli_epi32(A, 65);
    int[8] expectedD = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(D.array == expectedD);
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m256i _mm256_srli_epi16 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else
    {
        //PERF: ARM
        short16 sa = cast(short16)a;
        ubyte count = cast(ubyte)imm8;
        short16 r = cast(short16) _mm256_setzero_si256();
        if (count >= 16)
            return cast(__m256i)r;

        foreach(i; 0..16)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
    short16 B = cast(short16) _mm256_srli_epi16(A, 1);
    short16 B2 = cast(short16) _mm256_srli_epi16(A, 1 + 256);
    short[16] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3, 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16) _mm256_srli_epi16(A, 16);
    short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);

    short16 D = cast(short16) _mm256_srli_epi16(A, 0);
    short[16] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7 ];
    assert(D.array == expectedD);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m256i _mm256_srli_epi32 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else
    {
        ubyte count = cast(ubyte) imm8;
        int8 a_int8 = cast(int8) a;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        // D says "It's illegal to shift by the same or more bits
        // than the size of the quantity being shifted"
        // and it's UB instead.
        int8 r = cast(int8) _mm256_setzero_si256();
        if (count >= 32)
            return cast(__m256i) r;
        r.ptr[0] = a_int8.array[0] >>> count;
        r.ptr[1] = a_int8.array[1] >>> count;
        r.ptr[2] = a_int8.array[2] >>> count;
        r.ptr[3] = a_int8.array[3] >>> count;
        r.ptr[4] = a_int8.array[4] >>> count;
        r.ptr[5] = a_int8.array[5] >>> count;
        r.ptr[6] = a_int8.array[6] >>> count;
        r.ptr[7] = a_int8.array[7] >>> count;
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
    int8 B = cast(int8) _mm256_srli_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_srli_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 1, 1, 0x7FFFFFFE, 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_srli_epi32(A, 255);
    int[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}

/// Compute the bitwise XOR of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_xor_si256 (__m256i a, __m256i b) pure @safe
{
    return a ^ b;
}
// TODO unittest and thus force inline
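// Illustrative unittest sketch for _mm256_xor_si256, in the same style as the
// other tests in this module (the TODO above notes that a real test is still missing):
unittest
{
    __m256i A = _mm256_set1_epi32(7);
    __m256i B = _mm256_set1_epi32(14);
    int8 R = cast(int8) _mm256_xor_si256(A, B);
    int[8] correct = [9, 9, 9, 9, 9, 9, 9, 9]; // 7 ^ 14 == 9 in every lane
    assert(R.array == correct);
}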