1 /** 2 * AVX2 intrinsics. 3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX2 4 * 5 * Copyright: Guillaume Piolat 2022-2025. 6 * Johan Engelen 2022. 7 * cet 2024. 8 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 9 */ 10 module inteli.avx2intrin; 11 12 // AVX2 instructions 13 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX2 14 // Note: this header will work whether you have AVX2 enabled or not. 15 // With LDC, use "dflags-ldc": ["-mattr=+avx2"] or equivalent to actively 16 // generate AVX2 instructions. 17 // With GDC, use "dflags-gdc": ["-mavx2"] or equivalent to actively 18 // generate AVX2 instructions. 19 20 21 // Note: many special cases for GDC, because when suporting SIMD_COMPARISON_MASKS_32B but not having AVX2, 22 // the replaced operators have terrible performance. Mostly a problem for -mavx on x86 23 24 public import inteli.types; 25 import inteli.internals; 26 27 // Pull in all previous instruction set intrinsics. 28 public import inteli.avxintrin; 29 30 nothrow @nogc: 31 32 /// Compute the absolute value of packed signed 16-bit integers in `a`. 33 __m256i _mm256_abs_epi16 (__m256i a) @trusted 34 { 35 // PERF DMD 36 version(LDC) 37 enum split = true; // always beneficial in LDC neon, ssse3, or even sse2 38 else 39 enum split = GDC_with_SSSE3; 40 41 static if (GDC_with_AVX2) 42 { 43 return cast(__m256i) __builtin_ia32_pabsw256(cast(short16)a); 44 } 45 else static if (__VERSION__ >= 2097 && LDC_with_AVX2) 46 { 47 // Before LDC 1.27 llvm.abs LLVM intrinsic didn't exist, and hence 48 // no good way to do abs(256-bit) 49 return cast(__m256i) inteli_llvm_abs!short16(cast(short16)a, false); 50 } 51 else static if (split) 52 { 53 __m128i a_lo = _mm256_extractf128_si256!0(a); 54 __m128i a_hi = _mm256_extractf128_si256!1(a); 55 __m128i r_lo = _mm_abs_epi16(a_lo); 56 __m128i r_hi = _mm_abs_epi16(a_hi); 57 return _mm256_set_m128i(r_hi, r_lo); 58 } 59 else 60 { 61 short16 sa = cast(short16)a; 62 for (int i = 0; i < 16; ++i) 63 { 64 short s = sa.array[i]; 65 sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s)); 66 } 67 return cast(__m256i)sa; 68 } 69 } 70 unittest 71 { 72 __m256i A = _mm256_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000, 73 1, -1, -32768, 32767, 12, -13, 1000, -1040); 74 short16 B = cast(short16) _mm256_abs_epi16(A); 75 short[16] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000, 76 1, 1, -32768, 32767, 12, 13, 1000, 1040]; 77 assert(B.array == correct); 78 } 79 80 /// Compute the absolute value of packed signed 32-bit integers in `a`. 81 __m256i _mm256_abs_epi32 (__m256i a) @trusted 82 { 83 // PERF DMD 84 version(LDC) 85 enum split = true; // always beneficial in LDC neon, ssse3, or even sse2 86 else 87 enum split = false; // GDC manages to split and use pabsd in SSSE3 without guidance 88 89 static if (GDC_with_AVX2) 90 { 91 return cast(__m256i) __builtin_ia32_pabsd256(cast(int8)a); 92 } 93 else static if (__VERSION__ >= 2097 && LDC_with_AVX2) 94 { 95 // Before LDC 1.27 llvm.abs LLVM intrinsic didn't exist, and hence 96 // no good way to do abs(256-bit) 97 return cast(__m256i) inteli_llvm_abs!int8(cast(int8)a, false); 98 } 99 else static if (split) 100 { 101 __m128i a_lo = _mm256_extractf128_si256!0(a); 102 __m128i a_hi = _mm256_extractf128_si256!1(a); 103 __m128i r_lo = _mm_abs_epi32(a_lo); 104 __m128i r_hi = _mm_abs_epi32(a_hi); 105 return _mm256_set_m128i(r_hi, r_lo); 106 } 107 else 108 { 109 int8 sa = cast(int8)a; 110 for (int i = 0; i < 8; ++i) 111 { 112 int s = sa.array[i]; 113 sa.ptr[i] = (s >= 0 ? s : -s); 114 } 115 return cast(__m256i)sa; 116 } 117 } 118 unittest 119 { 120 __m256i A = _mm256_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647, -1, 0, -2_147_483_648, -2_147_483_646); 121 int8 B = cast(int8) _mm256_abs_epi32(A); 122 int[8] correct = [0, 1, -2_147_483_648, 2_147_483_647, 1, 0, -2_147_483_648, 2_147_483_646]; 123 assert(B.array == correct); 124 } 125 126 /// Compute the absolute value of packed signed 8-bit integers in `a`. 127 __m256i _mm256_abs_epi8 (__m256i a) @trusted 128 { 129 // PERF DMD 130 // PERF GDC in SSSE3 to AVX doesn't use pabsb and split is catastrophic because of _mm_min_epu8 131 version(LDC) 132 enum split = true; // always beneficial in LDC neon, ssse3, sse2 133 else 134 enum split = false; 135 136 static if (GDC_with_AVX2) 137 { 138 return cast(__m256i) __builtin_ia32_pabsb256(cast(ubyte32)a); 139 } 140 else static if (__VERSION__ >= 2097 && LDC_with_AVX2) 141 { 142 // Before LDC 1.27 llvm.abs LLVM intrinsic didn't exist, and hence 143 // no good way to do abs(256-bit) 144 return cast(__m256i) inteli_llvm_abs!byte32(cast(byte32)a, false); 145 } 146 else static if (split) 147 { 148 __m128i a_lo = _mm256_extractf128_si256!0(a); 149 __m128i a_hi = _mm256_extractf128_si256!1(a); 150 __m128i r_lo = _mm_abs_epi8(a_lo); 151 __m128i r_hi = _mm_abs_epi8(a_hi); 152 return _mm256_set_m128i(r_hi, r_lo); 153 } 154 else 155 { 156 // Basically this loop is poison for LDC optimizer 157 byte32 sa = cast(byte32)a; 158 for (int i = 0; i < 32; ++i) 159 { 160 byte s = sa.array[i]; 161 sa.ptr[i] = s >= 0 ? s : cast(byte)(-cast(int)(s)); 162 } 163 return cast(__m256i)sa; 164 } 165 } 166 unittest 167 { 168 __m256i A = _mm256_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 169 0, -1, -128, -126, 127, -6, -5, -4, -3, -2, 0, 1, 2, 3, 4, 5); 170 byte32 B = cast(byte32) _mm256_abs_epi8(A); 171 byte[32] correct = [0, 1, -128, 127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 172 0, 1, -128, 126, 127, 6, 5, 4, 3, 2, 0, 1, 2, 3, 4, 5]; 173 assert(B.array == correct); 174 } 175 176 /// Add packed 16-bit integers in `a` and `b`. 177 __m256i _mm256_add_epi16 (__m256i a, __m256i b) pure @safe 178 { 179 pragma(inline, true); 180 return cast(__m256i)(cast(short16)a + cast(short16)b); 181 } 182 unittest 183 { 184 __m256i A = _mm256_setr_epi16( -7, -1, 0, 9, -100, 100, 234, 432, -32768, 32767, 0, -1, -20000, 0, 6, -2); 185 short16 R = cast(short16) _mm256_add_epi16(A, A); 186 short[16] correct = [ -14, -2, 0, 18, -200, 200, 468, 864, 0, -2, 0, -2, 25536, 0, 12, -4 ]; 187 assert(R.array == correct); 188 } 189 190 /// Add packed 32-bit integers in `a` and `b`. 191 __m256i _mm256_add_epi32(__m256i a, __m256i b) pure @safe 192 { 193 pragma(inline, true); 194 return cast(__m256i)(cast(int8)a + cast(int8)b); 195 } 196 unittest 197 { 198 __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432); 199 int8 R = cast(int8) _mm256_add_epi32(A, A); 200 int[8] correct = [ -14, -2, 0, 18, -200, 200, 468, 864 ]; 201 assert(R.array == correct); 202 } 203 204 /// Add packed 64-bit integers in `a` and `b`. 205 __m256i _mm256_add_epi64 (__m256i a, __m256i b) pure @safe 206 { 207 pragma(inline, true); 208 return a + b; 209 } 210 unittest 211 { 212 __m256i A = _mm256_setr_epi64(-1, 0x8000_0000_0000_0000, 42, -12); 213 long4 R = cast(__m256i) _mm256_add_epi64(A, A); 214 long[4] correct = [ -2, 0, 84, -24 ]; 215 assert(R.array == correct); 216 } 217 218 /// Add packed 8-bit integers in `a` and `b`. 219 __m256i _mm256_add_epi8 (__m256i a, __m256i b) pure @safe 220 { 221 pragma(inline, true); 222 return cast(__m256i)(cast(byte32)a + cast(byte32)b); 223 } 224 unittest 225 { 226 __m256i A = _mm256_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78, 227 4, 9, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -2, 0, 10, 78); 228 byte32 R = cast(byte32) _mm256_add_epi8(A, A); 229 byte[32] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100, 230 8, 18, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -4, 0, 20, -100]; 231 assert(R.array == correct); 232 } 233 234 /// Add packed 16-bit signed integers in `a` and `b` using signed saturation. 235 __m256i _mm256_adds_epi16 (__m256i a, __m256i b) pure @trusted 236 { 237 // PERF DMD 238 static if (GDC_with_AVX2) 239 { 240 return cast(__m256i) __builtin_ia32_paddsw256(cast(short16)a, cast(short16)b); 241 } 242 else static if(LDC_with_saturated_intrinsics) 243 { 244 return cast(__m256i) inteli_llvm_adds!short16(cast(short16)a, cast(short16)b); 245 } 246 else 247 { 248 short16 r; 249 short16 sa = cast(short16)a; 250 short16 sb = cast(short16)b; 251 foreach(i; 0..16) 252 r.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 253 return cast(__m256i)r; 254 } 255 } 256 unittest 257 { 258 short16 res = cast(short16) _mm256_adds_epi16(_mm256_setr_epi16( 7, 6, 5, -32768, 3, 3, 32767, 0, 7, 6, 5, -32768, 3, 3, 32767, 0), 259 _mm256_setr_epi16( 7, 6, 5, -30000, 3, 1, 1, -10, 7, 6, 5, -30000, 3, 1, 1, -10)); 260 static immutable short[16] correctResult = [14, 12, 10, -32768, 6, 4, 32767, -10, 14, 12, 10, -32768, 6, 4, 32767, -10]; 261 assert(res.array == correctResult); 262 } 263 264 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation. 265 __m256i _mm256_adds_epi8 (__m256i a, __m256i b) pure @trusted 266 { 267 // PERF DMD 268 static if (GDC_with_AVX2) 269 { 270 return cast(__m256i) __builtin_ia32_paddsb256(cast(ubyte32)a, cast(ubyte32)b); 271 } 272 else static if(LDC_with_saturated_intrinsics) 273 { 274 return cast(__m256i) inteli_llvm_adds!byte32(cast(byte32)a, cast(byte32)b); 275 } 276 else 277 { 278 byte32 r; 279 byte32 sa = cast(byte32)a; 280 byte32 sb = cast(byte32)b; 281 foreach(i; 0..32) 282 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]); 283 return cast(__m256i)r; 284 } 285 } 286 unittest 287 { 288 byte32 res = cast(byte32) _mm256_adds_epi8(_mm256_setr_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0, 15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0), 289 _mm256_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, -4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, -4, 3, 2, 1, 0)); 290 static immutable byte[32] correctResult = [30, 28, 26, 24, 22, 127,18,16,14,12,10, -128, 6, 4, 2, 0, 30, 28, 26, 24, 22, 127,18,16,14,12,10, -128, 6, 4, 2, 0]; 291 assert(res.array == correctResult); 292 } 293 294 /// Add packed 16-bit unsigned integers in `a` and `b` using unsigned saturation. 295 __m256i _mm256_adds_epu16 (__m256i a, __m256i b) pure @trusted 296 { 297 // PERF DMD 298 static if (GDC_with_AVX2) 299 { 300 return cast(__m256i) __builtin_ia32_paddusw256(cast(short16)a, cast(short16)b); 301 } 302 else static if(LDC_with_saturated_intrinsics) 303 { 304 return cast(__m256i) inteli_llvm_addus!short16(cast(short16)a, cast(short16)b); 305 } 306 else 307 { 308 short16 r; 309 short16 sa = cast(short16)a; 310 short16 sb = cast(short16)b; 311 foreach(i; 0..16) 312 r.ptr[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 313 return cast(__m256i)r; 314 } 315 } 316 unittest 317 { 318 short16 res = cast(short16) _mm256_adds_epu16(_mm256_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0), 319 _mm256_set_epi16(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0)); 320 static immutable short[16] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6]; 321 assert(res.array == correctResult); 322 } 323 324 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 325 __m256i _mm256_adds_epu8 (__m256i a, __m256i b) pure @trusted 326 { 327 // PERF DMD 328 static if (GDC_with_AVX2) 329 { 330 return cast(__m256i) __builtin_ia32_paddusb256(cast(ubyte32)a, cast(ubyte32)b); 331 } 332 else static if(LDC_with_saturated_intrinsics) 333 { 334 return cast(__m256i) inteli_llvm_addus!byte32(cast(byte32)a, cast(byte32)b); 335 } 336 else 337 { 338 byte32 r; 339 byte32 sa = cast(byte32)a; 340 byte32 sb = cast(byte32)b; 341 foreach(i; 0..32) 342 r.ptr[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 343 return cast(__m256i)r; 344 } 345 } 346 unittest 347 { 348 __m256i A = _mm256_setr_epi8(0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, cast(byte)136, 0, 0, 0, 0, 0, 0); 349 __m256i B = _mm256_setr_epi8(0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, 40, 0, 0, 0, 0, 0, 0); 350 byte32 R = cast(byte32) _mm256_adds_epu8(A, B); 351 static immutable byte[32] correct = [0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, cast(byte)176, 0, 0, 0, 0, 0, 0]; 352 assert(R.array == correct); 353 } 354 355 /// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the 356 /// result right by `imm8` bytes, and return the low 16 bytes of that in each lane. 357 __m256i _mm256_alignr_epi8(ubyte count)(__m256i a, __m256i b) pure @trusted 358 { 359 360 // PERF DMD 361 static if (GDC_with_AVX2) 362 { 363 return cast(__m256i)__builtin_ia32_palignr256(a, b, count * 8); 364 } 365 else 366 { 367 // Note that palignr 256-bit does the same as palignr 128-bit by lane. Can split. 368 // With LDC 1.24 + avx2 feature + -02, that correctly gives a AVX2 vpalignr despite being split. 369 // I guess we could do it with a big 32-items shufflevector but not sure if best. 370 // 2 inst on ARM64 neon, which is optimal. 371 __m128i a_lo = _mm256_extractf128_si256!0(a); 372 __m128i a_hi = _mm256_extractf128_si256!1(a); 373 __m128i b_lo = _mm256_extractf128_si256!0(b); 374 __m128i b_hi = _mm256_extractf128_si256!1(b); 375 __m128i r_lo = _mm_alignr_epi8!count(a_lo, b_lo); 376 __m128i r_hi = _mm_alignr_epi8!count(a_hi, b_hi); 377 return _mm256_set_m128i(r_hi, r_lo); 378 } 379 } 380 unittest 381 { 382 __m128i A = _mm_setr_epi8( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); 383 __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); 384 __m256i AA = _mm256_set_m128i(A, A); 385 __m256i BB = _mm256_set_m128i(B, B); 386 387 { 388 byte32 C = cast(byte32) _mm256_alignr_epi8!0(AA, BB); 389 byte[32] correct = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]; 390 assert(C.array == correct); 391 } 392 { 393 byte32 C = cast(byte32) _mm256_alignr_epi8!20(AA, BB); 394 byte[32] correct = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0]; 395 assert(C.array == correct); 396 } 397 { 398 byte32 C = cast(byte32) _mm256_alignr_epi8!34(AA, BB); 399 byte[32] correct = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 400 assert(C.array == correct); 401 } 402 } 403 404 /// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`. 405 __m256i _mm256_and_si256 (__m256i a, __m256i b) pure @safe 406 { 407 pragma(inline, true); 408 return a & b; 409 } 410 unittest 411 { 412 __m256i A = _mm256_set1_epi32(7); 413 __m256i B = _mm256_set1_epi32(14); 414 int8 R = cast(int8) _mm256_and_si256(A, B); 415 int[8] correct = [6, 6, 6, 6, 6, 6, 6, 6]; 416 assert(R.array == correct); 417 } 418 419 /// Compute the bitwise NOT of 256 bits (representing integer data) in `a` and then AND with `b`. 420 __m256i _mm256_andnot_si256 (__m256i a, __m256i b) pure @safe 421 { 422 // See: https://issues.dlang.org/show_bug.cgi?id=24283, 423 // need workaround if we ever use DMD AVX codegen 424 425 pragma(inline, true); 426 return (~a) & b; 427 } 428 unittest 429 { 430 __m256i A = _mm256_setr_epi32(7, -2, 9, 54654, 7, -2, 9, 54654); 431 __m256i B = _mm256_setr_epi32(14, 78, 111, -256, 14, 78, 111, -256); 432 int8 R = cast(int8) _mm256_andnot_si256(A, B); 433 int[8] correct = [8, 0, 102, -54784, 8, 0, 102, -54784]; 434 assert(R.array == correct); 435 } 436 437 /// Average packed unsigned 16-bit integers in `a` and `b`. 438 __m256i _mm256_avg_epu16 (__m256i a, __m256i b) pure @trusted 439 { 440 static if (GDC_with_AVX2) 441 { 442 return cast(__m256i) __builtin_ia32_pavgw256(cast(short16)a, cast(short16)b); 443 } 444 else static if (LDC_with_AVX2 && __VERSION__ >= 2094) 445 { 446 return cast(__m256i) __builtin_ia32_pavgw256(cast(short16)a, cast(short16)b); 447 } 448 else 449 { 450 // Splitting is always beneficial here, except -O0 451 __m128i a_lo = _mm256_extractf128_si256!0(a); 452 __m128i a_hi = _mm256_extractf128_si256!1(a); 453 __m128i b_lo = _mm256_extractf128_si256!0(b); 454 __m128i b_hi = _mm256_extractf128_si256!1(b); 455 __m128i r_lo = _mm_avg_epu16(a_lo, b_lo); 456 __m128i r_hi = _mm_avg_epu16(a_hi, b_hi); 457 return _mm256_set_m128i(r_hi, r_lo); 458 } 459 } 460 unittest 461 { 462 __m256i A = _mm256_set1_epi16(31457); 463 __m256i B = _mm256_set1_epi16(cast(short)64000); 464 short16 avg = cast(short16)(_mm256_avg_epu16(A, B)); 465 foreach(i; 0..16) 466 assert(avg.array[i] == cast(short)47729); 467 } 468 469 /// Average packed unsigned 8-bit integers in `a` and `b`. 470 __m256i _mm256_avg_epu8 (__m256i a, __m256i b) pure @trusted 471 { 472 static if (GDC_with_AVX2) 473 { 474 return cast(__m256i) __builtin_ia32_pavgb256(cast(ubyte32)a, cast(ubyte32)b); 475 } 476 else static if (LDC_with_AVX2 && __VERSION__ >= 2094) 477 { 478 return cast(__m256i) __builtin_ia32_pavgb256(cast(byte32)a, cast(byte32)b); 479 } 480 else 481 { 482 // Splitting is always beneficial here, except -O0 483 __m128i a_lo = _mm256_extractf128_si256!0(a); 484 __m128i a_hi = _mm256_extractf128_si256!1(a); 485 __m128i b_lo = _mm256_extractf128_si256!0(b); 486 __m128i b_hi = _mm256_extractf128_si256!1(b); 487 __m128i r_lo = _mm_avg_epu8(a_lo, b_lo); 488 __m128i r_hi = _mm_avg_epu8(a_hi, b_hi); 489 return _mm256_set_m128i(r_hi, r_lo); 490 } 491 } 492 unittest 493 { 494 __m256i A = _mm256_set1_epi8(-1); 495 __m256i B = _mm256_set1_epi8(13); 496 byte32 avg = cast(byte32)(_mm256_avg_epu8(A, B)); 497 foreach(i; 0..32) 498 assert(avg.array[i] == cast(byte)134); 499 } 500 501 /// Blend packed 16-bit integers from `a` and `b` within 128-bit lanes using 8-bit control 502 /// mask `imm8`, in each of the two lanes. 503 /// Note: this is functionally equivalent to two `_mm_blend_epi16`. 504 __m256i _mm256_blend_epi16(int imm8) (__m256i a, __m256i b) pure @trusted 505 { 506 // PERF DMD 507 assert(imm8 >= 0 && imm8 < 256); 508 enum bool split = true; // makes things better, except on ARM32 which is no better than naive 509 510 static if (GDC_with_AVX2) 511 { 512 return cast(__m256i) __builtin_ia32_pblendw256(cast(short16)a, cast(short16)b, imm8); 513 } 514 else static if (split) 515 { 516 __m128i a_lo = _mm256_extractf128_si256!0(a); 517 __m128i a_hi = _mm256_extractf128_si256!1(a); 518 __m128i b_lo = _mm256_extractf128_si256!0(b); 519 __m128i b_hi = _mm256_extractf128_si256!1(b); 520 __m128i r_lo = _mm_blend_epi16!(imm8)(a_lo, b_lo); 521 __m128i r_hi = _mm_blend_epi16!(imm8)(a_hi, b_hi); 522 return _mm256_set_m128i(r_hi, r_lo); 523 } 524 } 525 unittest 526 { 527 __m256i A = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 0, -1, -2, -3, -4, -5, -6, -7); 528 __m256i B = _mm256_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15, -8, -9, -10, -11, -12, -13, -14, -15); 529 short16 C = cast(short16) _mm256_blend_epi16!147(A, B); // 10010011 10010011 530 short[16] correct = [8, 9, 2, 3, 12, 5, 6, 15, -8, -9, -2, -3, -12, -5, -6, -15]; 531 assert(C.array == correct); 532 } 533 534 /// Blend packed 32-bit integers from `a` and `b` using 4-bit control mask `imm8`. 535 __m128i _mm_blend_epi32(int imm8)(__m128i a, __m128i b) pure @trusted 536 { 537 // This one is interesting, it is functionally equivalent to SSE4.1 blendps (_mm_blend_ps) 538 // So without AVX2 we can always fallback to _mm_blend_ps 539 // And indeed, a shufflevector!int4 doesn't even use vpblendd with LDC, and prefer 540 // blendps and shufps so why bother. 541 542 // PERF DMD 543 static assert(imm8 >= 0 && imm8 < 16); 544 static if (GDC_with_AVX2) 545 { 546 return __builtin_ia32_pblendd128(a, b, imm8); 547 } 548 else 549 { 550 return cast(__m128i) _mm_blend_ps!imm8(cast(__m128)a, cast(__m128)b); 551 } 552 } 553 unittest 554 { 555 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 556 __m128i B = _mm_setr_epi32(8, 9, 10, 11); 557 int4 C = _mm_blend_epi32!13(A, B); // 1101 558 int[4] correct = [8, 1, 10, 11]; 559 assert(C.array == correct); 560 } 561 562 /// Blend packed 32-bit integers from `a` and `b` using 8-bit control mask `imm8`. 563 __m256i _mm256_blend_epi32(int imm8)(__m256i a, __m256i b) pure @trusted 564 { 565 // This one is functionally equivalent to AVX _mm256_blend_ps, except with integers. 566 // With LDC, doing a shufflevector here would select the vblendps instruction anyway, 567 // so we might as well defer to _mm256_blend_ps. 568 569 // PERF DMD 570 static assert(imm8 >= 0 && imm8 < 256); 571 static if (GDC_with_AVX2) 572 { 573 return cast(__m256i) __builtin_ia32_pblendd256 (cast(int8)a, cast(int8)b, imm8); 574 } 575 else 576 { 577 return cast(__m256i) _mm256_blend_ps!imm8(cast(__m256)a, cast(__m256)b); 578 } 579 } 580 unittest 581 { 582 __m256i A = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); 583 __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 147, 15); 584 int8 C = cast(int8) _mm256_blend_epi32!0xe7(A, B); 585 int[8] correct = [8, 9, 10, 3, 4, 13, 147, 15]; 586 assert(C.array == correct); 587 } 588 589 /// Blend packed 8-bit integers from `a` and `b` using `mask`. 590 /// Select from `b` if the high-order bit of the corresponding 8-bit element in `mask` is set, else select from `a`. 591 __m256i _mm256_blendv_epi8 (__m256i a, __m256i b, __m256i mask) pure @safe 592 { 593 // BUG PERF: this would fail the CI with GDC 12 594 /*static if (GDC_with_AVX2) 595 return cast(__m256i)__builtin_ia32_pblendvb256(cast(ubyte32)a, cast(ubyte32)b, cast(ubyte32)mask); 596 else 597 */ 598 599 static if (LDC_with_AVX2) 600 { 601 return cast(__m256i) __builtin_ia32_pblendvb256(cast(byte32)a, cast(byte32)b, cast(byte32)mask); 602 } 603 else 604 { 605 __m128i a_lo = _mm256_extractf128_si256!0(a); 606 __m128i a_hi = _mm256_extractf128_si256!1(a); 607 __m128i b_lo = _mm256_extractf128_si256!0(b); 608 __m128i b_hi = _mm256_extractf128_si256!1(b); 609 __m128i m_lo = _mm256_extractf128_si256!0(mask); 610 __m128i m_hi = _mm256_extractf128_si256!1(mask); 611 __m128i r_lo = _mm_blendv_epi8(a_lo, b_lo, m_lo); 612 __m128i r_hi = _mm_blendv_epi8(a_hi, b_hi, m_hi); 613 return _mm256_set_m128i(r_hi, r_lo); 614 } 615 } 616 unittest 617 { 618 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 619 8, 9, 10, 11, 12, 13, 14, 15); 620 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 621 24, 25, 26, 27, 28, 29, 30, 31); 622 __m128i M = _mm_setr_epi8( 1, -1, 1, 1, -4, 1, -8, 127, 623 1, 1, -1, -1, 4, 1, 8, -128); 624 __m256i AA = _mm256_set_m128i(A, A); 625 __m256i BB = _mm256_set_m128i(B, B); 626 __m256i MM = _mm256_set_m128i(M, M); 627 byte32 R = cast(byte32) _mm256_blendv_epi8(AA, BB, MM); 628 byte[32] correct = [ 0, 17, 2, 3, 20, 5, 22, 7, 8, 9, 26, 27, 12, 13, 14, 31, 629 0, 17, 2, 3, 20, 5, 22, 7, 8, 9, 26, 27, 12, 13, 14, 31 ]; 630 assert(R.array == correct); 631 } 632 633 /// Broadcast the low packed 8-bit integer from `a` to all elements of result. 634 __m128i _mm_broadcastb_epi8 (__m128i a) pure @safe 635 { 636 byte16 ba = cast(byte16)a; 637 byte16 r; 638 r = ba.array[0]; 639 return cast(__m128i)r; 640 } 641 unittest 642 { 643 byte16 A; 644 A.ptr[0] = 2; 645 byte16 B = cast(byte16) _mm_broadcastb_epi8(cast(__m128i)A); 646 byte[16] correct = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]; 647 assert(B.array == correct); 648 } 649 650 /// Bro0adcast the low packed 8-bit integer from `a` to all elements of result. 651 __m256i _mm256_broadcastb_epi8(__m128i a) pure @safe 652 { 653 byte16 ba = cast(byte16)a; 654 byte32 r; 655 r = ba.array[0]; 656 return cast(__m256i)r; 657 } 658 unittest 659 { 660 byte16 A; 661 A.ptr[0] = 2; 662 byte32 B = cast(byte32) _mm256_broadcastb_epi8(cast(__m128i)A); 663 byte[32] correct = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 664 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]; 665 assert(B.array == correct); 666 } 667 668 /// Broadcast the low packed 32-bit integer from `a` to all elements of result. 669 __m128i _mm_broadcastd_epi32 (__m128i a) pure @safe 670 { 671 int4 ba = cast(int4)a; 672 int4 r; 673 r = ba.array[0]; 674 return cast(__m128i)r; 675 } 676 unittest 677 { 678 int4 A; 679 A.ptr[0] = -2; 680 int4 B = cast(int4) _mm_broadcastd_epi32(cast(__m128i)A); 681 int[4] correct = [-2, -2, -2, -2]; 682 assert(B.array == correct); 683 } 684 685 /// Broadcast the low packed 32-bit integer from `a` to all elements of result. 686 __m256i _mm256_broadcastd_epi32 (__m128i a) pure @safe 687 { 688 int4 ba = cast(int4)a; 689 int8 r; 690 r = ba.array[0]; 691 return cast(__m256i)r; 692 } 693 unittest 694 { 695 int4 A; 696 A.ptr[0] = -2; 697 int8 B = cast(int8) _mm256_broadcastd_epi32(cast(__m128i)A); 698 int[8] correct = [-2, -2, -2, -2, -2, -2, -2, -2]; 699 assert(B.array == correct); 700 } 701 702 /// Broadcast the low packed 64-bit integer from `a` to all elements of result. 703 __m128i _mm_broadcastq_epi64 (__m128i a) pure @safe 704 { 705 long2 ba = cast(long2)a; 706 long2 r; 707 r = ba.array[0]; 708 return cast(__m128i)r; 709 } 710 unittest 711 { 712 long2 A; 713 A.ptr[0] = -2; 714 long2 B = cast(long2) _mm_broadcastq_epi64(cast(__m128i)A); 715 long[2] correct = [-2, -2]; 716 assert(B.array == correct); 717 } 718 719 /// Broadcast the low packed 64-bit integer from `a` to all elements of result. 720 __m256i _mm256_broadcastq_epi64 (__m128i a) pure @safe 721 { 722 long2 ba = cast(long2)a; 723 long4 r; 724 r = ba.array[0]; 725 return cast(__m256i)r; 726 } 727 unittest 728 { 729 long2 A; 730 A.ptr[0] = -2; 731 long4 B = cast(long4) _mm256_broadcastq_epi64(cast(__m128i)A); 732 long[4] correct = [-2, -2, -2, -2]; 733 assert(B.array == correct); 734 } 735 736 /// Broadcast the low double-precision (64-bit) floating-point element from `a` to all elements of result. 737 __m128d _mm_broadcastsd_pd (__m128d a) pure @safe 738 { 739 double2 r; 740 r = a.array[0]; 741 return r; 742 } 743 unittest 744 { 745 double2 A; 746 A.ptr[0] = 2; 747 double2 B = _mm_broadcastsd_pd(A); 748 double[2] correct = [2.0, 2.0]; 749 assert(B.array == correct); 750 } 751 752 /// Broadcast the low double-precision (64-bit) floating-point element from `a` to all elements of result. 753 __m256d _mm256_broadcastsd_pd (__m128d a) pure @safe 754 { 755 double4 r; 756 r = a.array[0]; 757 return r; 758 } 759 unittest 760 { 761 double2 A; 762 A.ptr[0] = 3; 763 double4 B = _mm256_broadcastsd_pd(A); 764 double[4] correct = [3.0, 3, 3, 3]; 765 assert(B.array == correct); 766 } 767 768 /// Broadcast 128 bits of integer data from ``a to all 128-bit lanes in result. 769 /// Note: also exist with name `_mm256_broadcastsi128_si256` which is identical. 770 __m256i _mm_broadcastsi128_si256 (__m128i a) pure @trusted 771 { 772 // Note that GDC will prefer vinserti128 to vbroadcast, for some reason 773 // So in the end it's the same as naive code. 774 // For this reason, __builtin_ia32_vbroadcastsi256 isn't used 775 long2 ba = cast(long2)a; 776 long4 r; 777 r.ptr[0] = ba.array[0]; 778 r.ptr[1] = ba.array[1]; 779 r.ptr[2] = ba.array[0]; 780 r.ptr[3] = ba.array[1]; 781 return cast(__m256i)r; 782 } 783 unittest 784 { 785 long2 A; 786 A.ptr[0] = 34; 787 A.ptr[1] = -56; 788 long4 B = cast(long4) _mm_broadcastsi128_si256(cast(__m128i)A); 789 long[4] correct = [34, -56, 34, -56]; 790 assert(B.array == correct); 791 } 792 793 ///ditto 794 alias _mm256_broadcastsi128_si256 = _mm_broadcastsi128_si256; // intrinsics is duplicated in the Guide, for some reason 795 796 /// Broadcast the low single-precision (32-bit) floating-point element from `a` to all elements of result. 797 __m128 _mm_broadcastss_ps (__m128 a) pure @safe 798 { 799 float4 r; 800 r = a.array[0]; 801 return r; 802 } 803 unittest 804 { 805 float4 A; 806 A.ptr[0] = 2; 807 float4 B = _mm_broadcastss_ps(A); 808 float[4] correct = [2.0f, 2, 2, 2]; 809 assert(B.array == correct); 810 } 811 812 /// Broadcast the low single-precision (32-bit) floating-point element from `a` to all elements of result. 813 __m256 _mm256_broadcastss_ps (__m128 a) pure @safe 814 { 815 float8 r; 816 r = a.array[0]; 817 return r; 818 } 819 unittest 820 { 821 float4 A; 822 A.ptr[0] = 2; 823 float8 B = _mm256_broadcastss_ps(A); 824 float[8] correct = [2.0f, 2, 2, 2, 2, 2, 2, 2]; 825 assert(B.array == correct); 826 } 827 828 /// Broadcast the low packed 16-bit integer from `a` to all elements of result. 829 __m128i _mm_broadcastw_epi16 (__m128i a) pure @safe 830 { 831 short8 ba = cast(short8)a; 832 short8 r; 833 r = ba.array[0]; 834 return cast(__m128i)r; 835 } 836 unittest 837 { 838 short8 A; 839 A.ptr[0] = 13; 840 short8 B = cast(short8) _mm_broadcastw_epi16(cast(__m128i)A); 841 short[8] correct = [13, 13, 13, 13, 13, 13, 13, 13]; 842 assert(B.array == correct); 843 } 844 845 /// Broadcast the low packed 16-bit integer from `a` to all elements of result. 846 __m256i _mm256_broadcastw_epi16 (__m128i a) pure @safe 847 { 848 short8 ba = cast(short8)a; 849 short16 r; 850 r = ba.array[0]; 851 return cast(__m256i)r; 852 } 853 unittest 854 { 855 short8 A; 856 A.ptr[0] = 13; 857 short16 B = cast(short16) _mm256_broadcastw_epi16(cast(__m128i)A); 858 short[16] correct = [13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]; 859 assert(B.array == correct); 860 } 861 862 863 /// Shift 128-bit lanes in `a` left by `bytes` bytes while shifting in zeroes. 864 __m256i _mm256_bslli_epi128(ubyte bytes)(__m256i a) pure @trusted 865 { 866 // Note: can't use __builtin_ia32_pslldqi256 with GDC, wants an immediate 867 // and even string mixin do not make it 868 // PERF: hence GDC AVX2 doesn't use the instruction, and nothing inlines very well in GDC either 869 static if (bytes >= 16) 870 { 871 return _mm256_setzero_si256(); 872 } 873 else static if (LDC_with_AVX2) 874 { 875 return cast(__m256i)__asm!(long4)("vpslldq $2, $1, $0", "=v,v,I", a, bytes); 876 } 877 else // split 878 { 879 __m128i lo = _mm_slli_si128!bytes(_mm256_extractf128_si256!0(a)); 880 __m128i hi = _mm_slli_si128!bytes(_mm256_extractf128_si256!1(a)); 881 return _mm256_set_m128i(hi, lo); 882 } 883 } 884 unittest 885 { 886 __m256i a = _mm256_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); 887 assert(_mm256_bslli_epi128!7(a).array == [72057594037927936, 650777868590383874, 1224979098644774912, 1808220633999610642]); 888 } 889 890 /// Shift 128-bit lanes in `a` right by `bytes` bytes while shifting in zeroes. 891 __m256i _mm256_bsrli_epi128(ubyte bytes)(__m256i a) pure @trusted 892 { 893 // Note: can't use __builtin_ia32_psrldqi256 with GDC, wants an immediate 894 // and even string mixin do not make it 895 // PERF: hence GDC AVX2 doesn't use the instruction, and nothing inlines very well in GDC either 896 static if (bytes >= 16) 897 { 898 return _mm256_setzero_si256(); 899 } 900 else static if (LDC_with_AVX2) 901 { 902 return cast(__m256i)__asm!(long4)("vpsrldq $2, $1, $0", "=v,v,I", a, bytes); 903 } 904 else // split 905 { 906 __m128i lo = _mm_srli_si128!bytes(_mm256_extractf128_si256!0(a)); 907 __m128i hi = _mm_srli_si128!bytes(_mm256_extractf128_si256!1(a)); 908 return _mm256_set_m128i(hi, lo); 909 } 910 } 911 unittest 912 { 913 __m256i a = _mm256_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); 914 assert(_mm256_bsrli_epi128!7(a).array == [1084818905618843912, 16, 2242261671028070680, 32]); 915 } 916 917 /// Compare packed 16-bit integers in `a` and `b` for equality. 918 __m256i _mm256_cmpeq_epi16 (__m256i a, __m256i b) pure @trusted 919 { 920 // PERF: GDC without AVX 921 // PERF: DMD 922 static if (SIMD_COMPARISON_MASKS_32B) 923 { 924 // PERF: catastrophic in GDC without AVX2 925 return cast(__m256i)(cast(short16)a == cast(short16)b); 926 } 927 else static if (GDC_with_AVX2) 928 { 929 return cast(__m256i) __builtin_ia32_pcmpeqw256(cast(short16)a, cast(short16)b); 930 } 931 else version(LDC) 932 { 933 return cast(__m256i) equalMask!short16(cast(short16)a, cast(short16)b); 934 } 935 else 936 { 937 short16 sa = cast(short16)a; 938 short16 sb = cast(short16)b; 939 short16 sr; 940 for (int n = 0; n < 16; ++n) 941 { 942 bool cond = sa.array[n] == sb.array[n]; 943 sr.ptr[n] = cond ? -1 : 0; 944 } 945 return cast(__m256i) sr; 946 } 947 } 948 unittest 949 { 950 short16 A = [-3, -2, -1, 0, 0, 1, 2, 3, -3, -2, -1, 0, 0, 1, 2, 3]; 951 short16 B = [ 4, 3, 2, 1, 0, -1, -2, -3, -3, 3, 2, 1, 0, -1, -2, -3]; 952 short[16] E = [ 0, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0]; 953 short16 R = cast(short16)(_mm256_cmpeq_epi16(cast(__m256i)A, cast(__m256i)B)); 954 assert(R.array == E); 955 } 956 957 /// Compare packed 32-bit integers in `a` and `b` for equality. 958 __m256i _mm256_cmpeq_epi32 (__m256i a, __m256i b) pure @trusted 959 { 960 // PERF: GDC without AVX 961 // PERF: DMD 962 static if (SIMD_COMPARISON_MASKS_32B) 963 { 964 // Quite bad in GDC -mavx (with no AVX2) 965 return cast(__m256i)(cast(int8)a == cast(int8)b); 966 } 967 else static if (GDC_with_AVX2) 968 { 969 return cast(__m256i) __builtin_ia32_pcmpeqd256(cast(int8)a, cast(int8)b); 970 } 971 else version(LDC) 972 { 973 return cast(__m256i) equalMask!int8(cast(int8)a, cast(int8)b); 974 } 975 else 976 { 977 int8 ia = cast(int8)a; 978 int8 ib = cast(int8)b; 979 int8 ir; 980 for (int n = 0; n < 8; ++n) 981 { 982 bool cond = ia.array[n] == ib.array[n]; 983 ir.ptr[n] = cond ? -1 : 0; 984 } 985 return cast(__m256i) ir; 986 } 987 } 988 unittest 989 { 990 int8 A = [-3, -2, -1, 0, -3, -2, -1, 0]; 991 int8 B = [ 4, -2, 2, 0, 4, -2, 2, 0]; 992 int[8] E = [ 0, -1, 0, -1, 0, -1, 0, -1]; 993 int8 R = cast(int8)(_mm256_cmpeq_epi32(cast(__m256i)A, cast(__m256i)B)); 994 assert(R.array == E); 995 } 996 997 /// Compare packed 64-bit integers in `a` and `b` for equality. 998 __m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b) pure @trusted 999 { 1000 // PERF: GDC without AVX 1001 // PERF: DMD 1002 static if (SIMD_COMPARISON_MASKS_32B) 1003 { 1004 // Note: enabling this with DMD will probably lead to same bug as _mm_cmpeq_epi64 1005 return cast(__m256i)(cast(long4)a == cast(long4)b); 1006 } 1007 else static if (GDC_with_AVX2) 1008 { 1009 return cast(__m256i)__builtin_ia32_pcmpeqq256(cast(long4)a, cast(long4)b); 1010 } 1011 else version(LDC) 1012 { 1013 return cast(__m256i) equalMask!long4(cast(long4)a, cast(long4)b); 1014 } 1015 else 1016 { 1017 long4 la = cast(long4)a; 1018 long4 lb = cast(long4)b; 1019 long4 res; 1020 res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0; 1021 res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0; 1022 res.ptr[2] = (la.array[2] == lb.array[2]) ? -1 : 0; 1023 res.ptr[3] = (la.array[3] == lb.array[3]) ? -1 : 0; 1024 return cast(__m256i)res; 1025 } 1026 } 1027 unittest 1028 { 1029 __m256i A = _mm256_setr_epi64(-1, -2, -1, -2); 1030 __m256i B = _mm256_setr_epi64(-3, -2, -3, -3); 1031 __m256i C = _mm256_setr_epi64(-1, -4, -1, -2); 1032 long4 AB = cast(long4) _mm256_cmpeq_epi64(A, B); 1033 long4 AC = cast(long4) _mm256_cmpeq_epi64(A, C); 1034 long[4] correct1 = [ 0, -1, 0, 0]; 1035 long[4] correct2 = [-1, 0, -1, -1]; 1036 assert(AB.array == correct1); 1037 assert(AC.array == correct2); 1038 } 1039 1040 /// Compare packed 8-bit integers in `a` and `b` for equality. 1041 __m256i _mm256_cmpeq_epi8 (__m256i a, __m256i b) pure @trusted 1042 { 1043 // PERF: GDC without AVX2, need split 1044 // PERF: DMD 1045 static if (SIMD_COMPARISON_MASKS_32B) 1046 { 1047 return cast(__m256i)(cast(byte32)a == cast(byte32)b); 1048 } 1049 else static if (GDC_with_AVX2) 1050 { 1051 return cast(__m256i) __builtin_ia32_pcmpeqb256(cast(ubyte32)a, cast(ubyte32)b); 1052 } 1053 else version(LDC) 1054 { 1055 return cast(__m256i) equalMask!byte32(cast(byte32)a, cast(byte32)b); 1056 } 1057 else 1058 { 1059 byte32 ba = cast(byte32)a; 1060 byte32 bb = cast(byte32)b; 1061 byte32 br; 1062 for (int n = 0; n < 32; ++n) 1063 { 1064 bool cond = ba.array[n] == bb.array[n]; 1065 br.ptr[n] = cond ? -1 : 0; 1066 } 1067 return cast(__m256i) br; 1068 } 1069 } 1070 unittest 1071 { 1072 __m256i A = _mm256_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1073 1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 42); 1074 __m256i B = _mm256_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1, 1075 2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 1076 byte32 C = cast(byte32) _mm256_cmpeq_epi8(A, B); 1077 byte[32] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1, 1078 0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, 0]; 1079 assert(C.array == correct); 1080 } 1081 1082 /// Compare packed signed 16-bit integers in `a` and `b` for greater-than. 1083 __m256i _mm256_cmpgt_epi16 (__m256i a, __m256i b) pure @safe 1084 { 1085 version(GNU) 1086 enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC without AVX2 1087 else 1088 enum bool mayUseComparisonOperator = true; 1089 1090 static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator) 1091 { 1092 return cast(__m256i)(cast(short16)a > cast(short16)b); 1093 } 1094 else static if (GDC_with_AVX2) 1095 { 1096 return cast(__m256i) __builtin_ia32_pcmpgtw256(cast(short16)a, cast(short16)b); 1097 } 1098 else // split 1099 { 1100 __m128i a_lo = _mm256_extractf128_si256!0(a); 1101 __m128i a_hi = _mm256_extractf128_si256!1(a); 1102 __m128i b_lo = _mm256_extractf128_si256!0(b); 1103 __m128i b_hi = _mm256_extractf128_si256!1(b); 1104 __m128i r_lo = _mm_cmpgt_epi16(a_lo, b_lo); 1105 __m128i r_hi = _mm_cmpgt_epi16(a_hi, b_hi); 1106 return _mm256_set_m128i(r_hi, r_lo); 1107 } 1108 } 1109 unittest 1110 { 1111 short16 A = [-3, -2, -1, 0, 0, 1, 2, 3, -3, -2, -1, 0, 0, 1, 2, 3]; 1112 short16 B = [ 4, 3, 2, 1, 0, -1, -2, -3, 4, -3, 2, 1, 0, -1, -2, -3]; 1113 short[16] E = [ 0, 0, 0, 0, 0, -1, -1, -1, 0, -1, 0, 0, 0, -1, -1, -1]; 1114 short16 R = cast(short16)(_mm256_cmpgt_epi16(cast(__m256i)A, cast(__m256i)B)); 1115 assert(R.array == E); 1116 } 1117 1118 /// Compare packed signed 32-bit integers in `a` and `b` for greater-than. 1119 __m256i _mm256_cmpgt_epi32 (__m256i a, __m256i b) pure @safe 1120 { 1121 version(GNU) 1122 enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC else 1123 else 1124 enum bool mayUseComparisonOperator = true; 1125 1126 static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator) 1127 { 1128 return cast(__m256i)(cast(int8)a > cast(int8)b); 1129 } 1130 else static if (GDC_with_AVX2) 1131 { 1132 return cast(__m256i) __builtin_ia32_pcmpgtd256(cast(int8)a, cast(int8)b); 1133 } 1134 else // split 1135 { 1136 __m128i a_lo = _mm256_extractf128_si256!0(a); 1137 __m128i a_hi = _mm256_extractf128_si256!1(a); 1138 __m128i b_lo = _mm256_extractf128_si256!0(b); 1139 __m128i b_hi = _mm256_extractf128_si256!1(b); 1140 __m128i r_lo = _mm_cmpgt_epi32(a_lo, b_lo); 1141 __m128i r_hi = _mm_cmpgt_epi32(a_hi, b_hi); 1142 return _mm256_set_m128i(r_hi, r_lo); 1143 } 1144 } 1145 unittest 1146 { 1147 int8 A = [-3, 2, -1, 0, -3, 2, -1, 0]; 1148 int8 B = [ 4, -2, 2, 0, 4, -2, 2, 0]; 1149 int[8] E = [ 0, -1, 0, 0, 0, -1, 0, 0]; 1150 int8 R = cast(int8) _mm256_cmpgt_epi32(cast(__m256i)A, cast(__m256i)B); 1151 assert(R.array == E); 1152 } 1153 1154 __m256i _mm256_cmpgt_epi64 (__m256i a, __m256i b) pure @safe 1155 { 1156 version(GNU) 1157 enum bool mayUseComparisonOperator = GDC_with_AVX2; // too slow in GDC else 1158 else 1159 enum bool mayUseComparisonOperator = true; 1160 1161 static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator) 1162 { 1163 return cast(__m256i)(cast(long4)a > cast(long4)b); 1164 } 1165 else static if (GDC_with_AVX2) 1166 { 1167 return cast(__m256i) __builtin_ia32_pcmpgtq256(cast(long4)a, cast(long4)b); 1168 } 1169 else // split 1170 { 1171 __m128i a_lo = _mm256_extractf128_si256!0(a); 1172 __m128i a_hi = _mm256_extractf128_si256!1(a); 1173 __m128i b_lo = _mm256_extractf128_si256!0(b); 1174 __m128i b_hi = _mm256_extractf128_si256!1(b); 1175 __m128i r_lo = _mm_cmpgt_epi64(a_lo, b_lo); 1176 __m128i r_hi = _mm_cmpgt_epi64(a_hi, b_hi); 1177 return _mm256_set_m128i(r_hi, r_lo); 1178 } 1179 } 1180 unittest 1181 { 1182 __m256i A = _mm256_setr_epi64(-3, 2, 70, 2); 1183 __m256i B = _mm256_setr_epi64 (4, -2, 4, -2); 1184 long[4] correct = [ 0, -1, -1, -1 ]; 1185 long4 R = cast(long4)(_mm256_cmpgt_epi64(A, B)); 1186 assert(R.array == correct); 1187 } 1188 1189 /// Compare packed signed 8-bit integers in `a` and `b` for greater-than. 1190 __m256i _mm256_cmpgt_epi8 (__m256i a, __m256i b) pure @safe 1191 { 1192 version(GNU) 1193 { 1194 // too slow in GDC without AVX2, but also doesn't 1195 // work in CI? BUG PERF 1196 enum bool mayUseComparisonOperator = false; 1197 } 1198 else 1199 enum bool mayUseComparisonOperator = true; 1200 1201 static if (SIMD_COMPARISON_MASKS_32B && mayUseComparisonOperator) 1202 { 1203 return cast(__m256i)(cast(byte32)a > cast(byte32)b); 1204 } 1205 /*else static if (GDC_with_AVX2) 1206 { 1207 return cast(__m256i) __builtin_ia32_pcmpgtb256(cast(ubyte32)a, cast(ubyte32)b); 1208 }*/ 1209 else // split 1210 { 1211 __m128i a_lo = _mm256_extractf128_si256!0(a); 1212 __m128i a_hi = _mm256_extractf128_si256!1(a); 1213 __m128i b_lo = _mm256_extractf128_si256!0(b); 1214 __m128i b_hi = _mm256_extractf128_si256!1(b); 1215 __m128i r_lo = _mm_cmpgt_epi8(a_lo, b_lo); 1216 __m128i r_hi = _mm_cmpgt_epi8(a_hi, b_hi); 1217 return _mm256_set_m128i(r_hi, r_lo); 1218 } 1219 } 1220 unittest 1221 { 1222 __m256i A = _mm256_setr_epi8(1, 2, 3, 1, 127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1, 2, 3, 1, 127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 1223 __m256i B = _mm256_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1, 2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 0); 1224 byte32 C = cast(byte32) _mm256_cmpgt_epi8(A, B); 1225 byte[32] correct = [0, 0,-1, 0, -1, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0, 0, 0,-1, 0, -1, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1,-1]; 1226 assert(C.array == correct); 1227 } 1228 1229 1230 /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers. 1231 __m256i _mm256_cvtepi16_epi32 (__m128i a) pure @trusted 1232 { 1233 static if (GDC_with_AVX2) 1234 { 1235 return cast(__m256i) __builtin_ia32_pmovsxwd256(cast(short8)a); 1236 } 1237 else static if (LDC_with_optimizations) 1238 { 1239 enum ir = ` 1240 %r = sext <8 x i16> %0 to <8 x i32> 1241 ret <8 x i32> %r`; 1242 return cast(__m256i) LDCInlineIR!(ir, int8, short8)(cast(short8)a); 1243 } 1244 else 1245 { 1246 short8 sa = cast(short8)a; 1247 int8 r; 1248 r.ptr[0] = sa.array[0]; 1249 r.ptr[1] = sa.array[1]; 1250 r.ptr[2] = sa.array[2]; 1251 r.ptr[3] = sa.array[3]; 1252 r.ptr[4] = sa.array[4]; 1253 r.ptr[5] = sa.array[5]; 1254 r.ptr[6] = sa.array[6]; 1255 r.ptr[7] = sa.array[7]; 1256 return cast(__m256i)r; 1257 } 1258 } 1259 unittest 1260 { 1261 __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767); 1262 int8 C = cast(int8) _mm256_cvtepi16_epi32(A); 1263 int[8] correct = [-1, 0, -32768, 32767, -1, 0, -32768, 32767]; 1264 assert(C.array == correct); 1265 } 1266 1267 1268 /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers. 1269 __m256i _mm256_cvtepi16_epi64 (__m128i a) pure @trusted 1270 { 1271 static if (GDC_with_AVX2) 1272 { 1273 return cast(__m256i) __builtin_ia32_pmovsxwq256(cast(short8)a); 1274 } 1275 else static if (LDC_with_optimizations) 1276 { 1277 enum ir = ` 1278 %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3> 1279 %r = sext <4 x i16> %v to <4 x i64> 1280 ret <4 x i64> %r`; 1281 return cast(__m256i) LDCInlineIR!(ir, long4, short8)(cast(short8)a); 1282 } 1283 else 1284 { 1285 // LDC x86 generates vpmovsxwq since LDC 1.12 -O1 1286 short8 sa = cast(short8)a; 1287 long4 r; 1288 r.ptr[0] = sa.array[0]; 1289 r.ptr[1] = sa.array[1]; 1290 r.ptr[2] = sa.array[2]; 1291 r.ptr[3] = sa.array[3]; 1292 return cast(__m256i)r; 1293 } 1294 } 1295 unittest 1296 { 1297 __m128i A = _mm_setr_epi16(-1, 0, short.min, short.max, 2, 3, 4, 5); 1298 long4 C = cast(long4) _mm256_cvtepi16_epi64(A); 1299 long[4] correct = [-1, 0, short.min, short.max]; 1300 assert(C.array == correct); 1301 } 1302 1303 /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers. 1304 __m256i _mm256_cvtepi32_epi64 (__m128i a) pure @trusted 1305 { 1306 long4 r; 1307 r.ptr[0] = a.array[0]; 1308 r.ptr[1] = a.array[1]; 1309 r.ptr[2] = a.array[2]; 1310 r.ptr[3] = a.array[3]; 1311 return cast(__m256i)r; 1312 } 1313 unittest 1314 { 1315 __m128i A = _mm_setr_epi32(-1, 0, int.min, int.max); 1316 long4 C = cast(long4) _mm256_cvtepi32_epi64(A); 1317 long[4] correct = [-1, 0, int.min, int.max]; 1318 assert(C.array == correct); 1319 } 1320 1321 /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers. 1322 __m256i _mm256_cvtepi8_epi16 (__m128i a) pure @trusted 1323 { 1324 static if (GDC_with_AVX2) 1325 { 1326 return cast(__m256i) __builtin_ia32_pmovsxbw256(cast(ubyte16)a); 1327 } 1328 else static if (LDC_with_optimizations) 1329 { 1330 enum ir = ` 1331 %r = sext <16 x i8> %0 to <16 x i16> 1332 ret <16 x i16> %r`; 1333 return cast(__m256i) LDCInlineIR!(ir, short16, byte16)(cast(byte16)a); 1334 } 1335 else 1336 { 1337 short16 r; 1338 byte16 ba = cast(byte16)a; 1339 for (int n = 0; n < 16; ++n) 1340 { 1341 r.ptr[n] = ba.array[n]; 1342 } 1343 return cast(__m256i)r; 1344 } 1345 } 1346 unittest 1347 { 1348 __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13); 1349 short16 C = cast(short16) _mm256_cvtepi8_epi16(A); 1350 short[16] correct = [-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]; 1351 assert(C.array == correct); 1352 } 1353 1354 /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers. 1355 __m256i _mm256_cvtepi8_epi32 (__m128i a) pure @trusted 1356 { 1357 static if (GDC_with_AVX2) 1358 { 1359 return cast(__m256i) __builtin_ia32_pmovsxbd256(cast(ubyte16)a); 1360 } 1361 else static if (LDC_with_optimizations) 1362 { 1363 enum ir = ` 1364 %v = shufflevector <16 x i8> %0,<16 x i8> undef, <8 x i32> <i32 0, i32 1,i32 2, i32 3, i32 4, i32 5,i32 6, i32 7> 1365 %r = sext <8 x i8> %v to <8 x i32> 1366 ret <8 x i32> %r`; 1367 return cast(__m256i) LDCInlineIR!(ir, int8, byte16)(cast(byte16)a); 1368 } 1369 else 1370 { 1371 // PERF This is rather bad in GDC without AVX, or with DMD 1372 // should split that 1373 int8 r; 1374 byte16 ba = cast(byte16)a; 1375 for (int n = 0; n < 8; ++n) 1376 { 1377 r.ptr[n] = ba.array[n]; 1378 } 1379 return cast(__m256i)r; 1380 } 1381 } 1382 unittest 1383 { 1384 __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13); 1385 int8 C = cast(int8) _mm256_cvtepi8_epi32(A); 1386 int[8] correct = [-1, 0, byte.min, byte.max, 2, 3, 4, 5]; 1387 assert(C.array == correct); 1388 } 1389 1390 /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers. 1391 __m256i _mm256_cvtepi8_epi64 (__m128i a) pure @trusted 1392 { 1393 // PERF This is rather bad in GDC without AVX 1394 static if (GDC_with_AVX2) 1395 { 1396 return cast(__m256i) __builtin_ia32_pmovsxbq256(cast(ubyte16)a); 1397 } 1398 else static if (LDC_with_ARM64) 1399 { 1400 // 4 inst since LDC 1.22 -O2 1401 return _mm256_cvtepi16_epi64(_mm_cvtepi8_epi16(a)); 1402 } 1403 else static if (LDC_with_optimizations) 1404 { 1405 enum ir = ` 1406 %v = shufflevector <16 x i8> %0,<16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1407 %r = sext <4 x i8> %v to <4 x i64> 1408 ret <4 x i64> %r`; 1409 return cast(__m256i) LDCInlineIR!(ir, long4, byte16)(cast(byte16)a); 1410 } 1411 else 1412 { 1413 long4 r; 1414 byte16 ba = cast(byte16)a; 1415 for (int n = 0; n < 4; ++n) 1416 { 1417 r.ptr[n] = ba.array[n]; 1418 } 1419 return cast(__m256i)r; 1420 } 1421 } 1422 unittest 1423 { 1424 __m128i A = _mm_setr_epi8(-1, 0, byte.min, byte.max, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13); 1425 long4 C = cast(long4) _mm256_cvtepi8_epi64(A); 1426 long[4] correct = [-1, 0, byte.min, byte.max]; 1427 assert(C.array == correct); 1428 } 1429 1430 /// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit integers. 1431 __m256i _mm256_cvtepu16_epi32(__m128i a) pure @trusted 1432 { 1433 static if (GDC_with_AVX2) 1434 { 1435 return cast(__m256i) __builtin_ia32_pmovzxwd256(cast(short8)a); 1436 } 1437 else 1438 { 1439 short8 sa = cast(short8)a; 1440 int8 r; 1441 r.ptr[0] = cast(ushort)sa.array[0]; 1442 r.ptr[1] = cast(ushort)sa.array[1]; 1443 r.ptr[2] = cast(ushort)sa.array[2]; 1444 r.ptr[3] = cast(ushort)sa.array[3]; 1445 r.ptr[4] = cast(ushort)sa.array[4]; 1446 r.ptr[5] = cast(ushort)sa.array[5]; 1447 r.ptr[6] = cast(ushort)sa.array[6]; 1448 r.ptr[7] = cast(ushort)sa.array[7]; 1449 return cast(__m256i)r; 1450 } 1451 } 1452 unittest 1453 { 1454 __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767); 1455 int8 C = cast(int8) _mm256_cvtepu16_epi32(A); 1456 int[8] correct = [65535, 0, 32768, 32767, 65535, 0, 32768, 32767]; 1457 assert(C.array == correct); 1458 } 1459 1460 /// Zero-extend packed unsigned 16-bit integers in `a` to packed 64-bit integers. 1461 __m256i _mm256_cvtepu16_epi64(__m128i a) pure @trusted 1462 { 1463 static if (GDC_with_AVX2) 1464 { 1465 return cast(__m256i) __builtin_ia32_pmovzxwq256(cast(short8)a); 1466 } 1467 else static if (LDC_with_optimizations) 1468 { 1469 enum ir = ` 1470 %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3> 1471 %r = zext <4 x i16> %v to <4 x i64> 1472 ret <4 x i64> %r`; 1473 return cast(__m256i) LDCInlineIR!(ir, long4, short8)(cast(short8)a); 1474 } 1475 else 1476 { 1477 short8 sa = cast(short8)a; 1478 long4 r; 1479 r.ptr[0] = cast(ushort)sa.array[0]; 1480 r.ptr[1] = cast(ushort)sa.array[1]; 1481 r.ptr[2] = cast(ushort)sa.array[2]; 1482 r.ptr[3] = cast(ushort)sa.array[3]; 1483 return cast(__m256i)r; 1484 } 1485 } 1486 unittest 1487 { 1488 __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 2, 3, 4, 5); 1489 long4 C = cast(long4) _mm256_cvtepu16_epi64(A); 1490 long[4] correct = [65535, 0, 32768, 32767]; 1491 assert(C.array == correct); 1492 } 1493 1494 /// Zero-extend packed unsigned 32-bit integers in `a` to packed 64-bit integers. 1495 __m256i _mm256_cvtepu32_epi64 (__m128i a) pure @trusted 1496 { 1497 static if (GDC_with_AVX2) 1498 { 1499 return cast(__m256i) __builtin_ia32_pmovzxdq256(cast(int4)a); 1500 } 1501 else static if (LDC_with_optimizations) 1502 { 1503 enum ir = ` 1504 %r = zext <4 x i32> %0 to <4 x i64> 1505 ret <4 x i64> %r`; 1506 return cast(__m256i) LDCInlineIR!(ir, long4, int4)(cast(int4)a); 1507 } 1508 else 1509 { 1510 long4 r; 1511 r.ptr[0] = cast(uint)a.array[0]; 1512 r.ptr[1] = cast(uint)a.array[1]; 1513 r.ptr[2] = cast(uint)a.array[2]; 1514 r.ptr[3] = cast(uint)a.array[3]; 1515 return cast(__m256i)r; 1516 } 1517 } 1518 unittest 1519 { 1520 __m128i A = _mm_setr_epi32(-1, 0, int.min, int.max); 1521 long4 C = cast(long4) _mm256_cvtepu32_epi64(A); 1522 long[4] correct = [uint.max, 0, 2_147_483_648, int.max]; 1523 assert(C.array == correct); 1524 } 1525 1526 /// Zero-extend packed unsigned 8-bit integers in `a` to packed 16-bit integers. 1527 __m256i _mm256_cvtepu8_epi16 (__m128i a) pure @trusted 1528 { 1529 static if (GDC_with_AVX2) 1530 { 1531 return cast(__m256i) __builtin_ia32_pmovzxbw256(cast(ubyte16)a); 1532 } 1533 else static if (LDC_with_optimizations) 1534 { 1535 enum ir = ` 1536 %r = zext <16 x i8> %0 to <16 x i16> 1537 ret <16 x i16> %r`; 1538 return cast(__m256i) LDCInlineIR!(ir, short16, byte16)(cast(byte16)a); 1539 } 1540 else 1541 { 1542 short16 r; 1543 byte16 ba = cast(byte16)a; 1544 for (int n = 0; n < 16; ++n) 1545 { 1546 r.ptr[n] = cast(ubyte)ba.array[n]; 1547 } 1548 return cast(__m256i)r; 1549 } 1550 } 1551 unittest 1552 { 1553 __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13); 1554 short16 C = cast(short16) _mm256_cvtepu8_epi16(A); 1555 short[16] correct = [255, 0, 128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]; 1556 assert(C.array == correct); 1557 } 1558 1559 /// Zero-extend packed unsigned 8-bit integers in `a` to packed 32-bit integers. 1560 __m256i _mm256_cvtepu8_epi32 (__m128i a) pure @trusted 1561 { 1562 static if (GDC_with_AVX2) 1563 { 1564 return cast(__m256i) __builtin_ia32_pmovzxbd256(cast(ubyte16)a); 1565 } 1566 else static if (LDC_with_optimizations) 1567 { 1568 enum ir = ` 1569 %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1,i32 2, i32 3, i32 4, i32 5,i32 6, i32 7> 1570 %r = zext <8 x i8> %v to <8 x i32> 1571 ret <8 x i32> %r`; 1572 return cast(__m256i) LDCInlineIR!(ir, int8, byte16)(cast(byte16)a); 1573 } 1574 else 1575 { 1576 int8 r; 1577 byte16 ba = cast(byte16)a; 1578 for (int n = 0; n < 8; ++n) 1579 { 1580 r.ptr[n] = cast(ubyte)ba.array[n]; 1581 } 1582 return cast(__m256i)r; 1583 } 1584 } 1585 unittest 1586 { 1587 __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13); 1588 int8 C = cast(int8) _mm256_cvtepu8_epi32(A); 1589 int[8] correct = [255, 0, 128, 127, 2, 3, 4, 5]; 1590 assert(C.array == correct); 1591 } 1592 1593 /// Zero-extend packed unsigned 8-bit integers in `a` to packed 64-bit integers. 1594 __m256i _mm256_cvtepu8_epi64 (__m128i a) pure @trusted 1595 { 1596 // PERF ARM64+LDC, not awesome 1597 static if (GDC_with_AVX2) 1598 { 1599 return cast(__m256i) __builtin_ia32_pmovzxbq256(cast(ubyte16)a); 1600 } 1601 else static if (LDC_with_optimizations) 1602 { 1603 enum ir = ` 1604 %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3> 1605 %r = zext <4 x i8> %v to <4 x i64> 1606 ret <4 x i64> %r`; 1607 return cast(__m256i) LDCInlineIR!(ir, long4, byte16)(cast(byte16)a); 1608 } 1609 else 1610 { 1611 long4 r; 1612 byte16 ba = cast(byte16)a; 1613 for (int n = 0; n < 4; ++n) 1614 { 1615 r.ptr[n] = cast(ubyte)ba.array[n]; 1616 } 1617 return cast(__m256i)r; 1618 } 1619 } 1620 unittest 1621 { 1622 __m128i A = _mm_setr_epi8(-1, 0, -128, 127, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13); 1623 long4 C = cast(long4) _mm256_cvtepu8_epi64(A); 1624 long[4] correct = [255, 0, 128, 127]; 1625 assert(C.array == correct); 1626 } 1627 1628 /// Extract a 16-bit integer from `a`, selected with index. 1629 int _mm256_extract_epi16 (__m256i a, int index) pure @trusted 1630 { 1631 short16 sa = cast(short16)a; 1632 return sa.ptr[index & 15]; 1633 } 1634 unittest 1635 { 1636 short16 b; 1637 b = 43; 1638 assert(_mm256_extract_epi16(cast(__m256i)b, 7) == 43); 1639 } 1640 1641 /// Extract a 8-bit integer from `a`, selected with index. 1642 int _mm256_extract_epi8 (__m256i a, int index) pure @trusted 1643 { 1644 byte32 sa = cast(byte32)a; 1645 return sa.ptr[index & 31]; 1646 } 1647 unittest 1648 { 1649 byte32 b; 1650 b = -44; 1651 assert(_mm256_extract_epi8(cast(__m256i)b, 5) == -44); 1652 assert(_mm256_extract_epi8(cast(__m256i)b, 5 + 32) == -44); 1653 } 1654 1655 /// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`. 1656 __m128i _mm256_extracti128_si256(int imm8)(__m256i a) pure @trusted 1657 if ( (imm8 == 0) || (imm8 == 1) ) 1658 { 1659 pragma(inline, true); 1660 1661 static if (GDC_with_AVX2) 1662 { 1663 return cast(__m128i) __builtin_ia32_extract128i256(a, imm8); 1664 } 1665 else static if (LDC_with_optimizations) 1666 { 1667 enum str = (imm8 == 1) ? "<i32 2, i32 3>" : "<i32 0, i32 1>"; 1668 enum ir = "%r = shufflevector <4 x i64> %0, <4 x i64> undef, <2 x i32>" ~ str ~ "\n" ~ 1669 "ret <2 x i64> %r"; 1670 return cast(__m128i) LDCInlineIR!(ir, ulong2, ulong4)(cast(ulong4)a); 1671 } 1672 else 1673 { 1674 long4 al = cast(long4) a; 1675 long2 ret; 1676 ret.ptr[0] = (imm8==1) ? al.array[2] : al.array[0]; 1677 ret.ptr[1] = (imm8==1) ? al.array[3] : al.array[1]; 1678 return cast(__m128i) ret; 1679 } 1680 } 1681 unittest 1682 { 1683 __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432 ); 1684 int[4] correct0 = [ -7, -1, 0, 9 ]; 1685 int[4] correct1 = [ -100, 100, 234, 432 ]; 1686 __m128i R0 = _mm256_extracti128_si256!(0)(A); 1687 __m128i R1 = _mm256_extracti128_si256!(1)(A); 1688 assert(R0.array == correct0); 1689 assert(R1.array == correct1); 1690 } 1691 1692 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results. 1693 __m256i _mm256_hadd_epi16 (__m256i a, __m256i b) pure @safe 1694 { 1695 static if (GDC_or_LDC_with_AVX2) 1696 { 1697 return cast(__m256i) __builtin_ia32_phaddw256(cast(short16)a, cast(short16)b); 1698 } 1699 else 1700 { 1701 __m128i a_lo = _mm256_extractf128_si256!0(a); 1702 __m128i a_hi = _mm256_extractf128_si256!1(a); 1703 __m128i b_lo = _mm256_extractf128_si256!0(b); 1704 __m128i b_hi = _mm256_extractf128_si256!1(b); 1705 __m128i r_lo = _mm_hadd_epi16(a_lo, b_lo); 1706 __m128i r_hi = _mm_hadd_epi16(a_hi, b_hi); 1707 return _mm256_set_m128i(r_hi, r_lo); 1708 } 1709 } 1710 unittest 1711 { 1712 __m256i A = _mm256_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768, 1, -2, 4, 8, 16, 32, -1, -32768); 1713 short16 C = cast(short16) _mm256_hadd_epi16(A, A); 1714 short[16] correct = [ -1, 12, 48, 32767, -1, 12, 48, 32767, -1, 12, 48, 32767, -1, 12, 48, 32767]; 1715 assert(C.array == correct); 1716 } 1717 1718 /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results. 1719 __m256i _mm256_hadd_epi32 (__m256i a, __m256i b) pure @safe 1720 { 1721 static if (GDC_or_LDC_with_AVX2) 1722 { 1723 return cast(__m256i) __builtin_ia32_phaddd256(cast(int8)a, cast(int8)b); 1724 } 1725 else 1726 { 1727 __m128i a_lo = _mm256_extractf128_si256!0(a); 1728 __m128i a_hi = _mm256_extractf128_si256!1(a); 1729 __m128i b_lo = _mm256_extractf128_si256!0(b); 1730 __m128i b_hi = _mm256_extractf128_si256!1(b); 1731 __m128i r_lo = _mm_hadd_epi32(a_lo, b_lo); 1732 __m128i r_hi = _mm_hadd_epi32(a_hi, b_hi); 1733 return _mm256_set_m128i(r_hi, r_lo); 1734 } 1735 } 1736 unittest 1737 { 1738 __m256i A = _mm256_setr_epi32(1, -2, int.min, -1, 1, -2, int.min, -1); 1739 __m256i B = _mm256_setr_epi32(1, int.max, 4, -4, 1, int.max, 4, -4); 1740 int8 C = cast(int8) _mm256_hadd_epi32(A, B); 1741 int[8] correct = [ -1, int.max, int.min, 0, -1, int.max, int.min, 0 ]; 1742 assert(C.array == correct); 1743 } 1744 1745 /// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, and pack the signed 16-bit results. 1746 __m256i _mm256_hadds_epi16 (__m256i a, __m256i b) pure @safe 1747 { 1748 static if (GDC_or_LDC_with_AVX2) 1749 { 1750 return cast(__m256i) __builtin_ia32_phaddsw256(cast(short16)a, cast(short16)b); 1751 } 1752 else 1753 { 1754 __m128i a_lo = _mm256_extractf128_si256!0(a); 1755 __m128i a_hi = _mm256_extractf128_si256!1(a); 1756 __m128i b_lo = _mm256_extractf128_si256!0(b); 1757 __m128i b_hi = _mm256_extractf128_si256!1(b); 1758 __m128i r_lo = _mm_hadds_epi16(a_lo, b_lo); 1759 __m128i r_hi = _mm_hadds_epi16(a_hi, b_hi); 1760 return _mm256_set_m128i(r_hi, r_lo); 1761 } 1762 } 1763 unittest 1764 { 1765 __m256i A = _mm256_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768, 1, -2, 4, 8, 16, 32, -1, -32768); 1766 short16 C = cast(short16) _mm256_hadds_epi16(A, A); 1767 short[16] correct = [ -1, 12, 48, -32768, -1, 12, 48, -32768, -1, 12, 48, -32768, -1, 12, 48, -32768]; 1768 assert(C.array == correct); 1769 } 1770 1771 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results. 1772 __m256i _mm256_hsub_epi16 (__m256i a, __m256i b) pure @safe 1773 { 1774 static if (GDC_or_LDC_with_AVX2) 1775 { 1776 return cast(__m256i) __builtin_ia32_phsubw256(cast(short16)a, cast(short16)b); 1777 } 1778 else 1779 { 1780 __m128i a_lo = _mm256_extractf128_si256!0(a); 1781 __m128i a_hi = _mm256_extractf128_si256!1(a); 1782 __m128i b_lo = _mm256_extractf128_si256!0(b); 1783 __m128i b_hi = _mm256_extractf128_si256!1(b); 1784 __m128i r_lo = _mm_hsub_epi32(a_lo, b_lo); 1785 __m128i r_hi = _mm_hsub_epi32(a_hi, b_hi); 1786 return _mm256_set_m128i(r_hi, r_lo); 1787 } 1788 } 1789 unittest 1790 { 1791 __m256i A = _mm256_setr_epi32(1, 2, int.min, 1, 1, 2, int.min, 1); 1792 __m256i B = _mm256_setr_epi32(int.max, -1, 4, 4, int.max, -1, 4, 4); 1793 int8 C = cast(int8) _mm256_hsub_epi32(A, B); 1794 int[8] correct = [ -1, int.max, int.min, 0, -1, int.max, int.min, 0 ]; 1795 assert(C.array == correct); 1796 } 1797 1798 /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results. 1799 __m256i _mm256_hsub_epi32 (__m256i a, __m256i b) pure @safe 1800 { 1801 static if (GDC_or_LDC_with_AVX2) 1802 { 1803 return cast(__m256i) __builtin_ia32_phsubd256(cast(int8)a, cast(int8)b); 1804 } 1805 else 1806 { 1807 __m128i a_lo = _mm256_extractf128_si256!0(a); 1808 __m128i a_hi = _mm256_extractf128_si256!1(a); 1809 __m128i b_lo = _mm256_extractf128_si256!0(b); 1810 __m128i b_hi = _mm256_extractf128_si256!1(b); 1811 __m128i r_lo = _mm_hsub_epi32(a_lo, b_lo); 1812 __m128i r_hi = _mm_hsub_epi32(a_hi, b_hi); 1813 return _mm256_set_m128i(r_hi, r_lo); 1814 } 1815 } 1816 unittest 1817 { 1818 __m256i A = _mm256_setr_epi32(1, 2, int.min, 1, 1, 2, int.min, 1); 1819 __m256i B = _mm256_setr_epi32(int.max, -1, 4, 4, int.max, -1, 4, 4); 1820 int8 C = cast(int8) _mm256_hsub_epi32(A, B); 1821 int[8] correct = [ -1, int.max, int.min, 0, -1, int.max, int.min, 0 ]; 1822 assert(C.array == correct); 1823 } 1824 1825 /// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, and pack the signed 16-bit results. 1826 __m256i _mm256_hsubs_epi16 (__m256i a, __m256i b) pure @safe 1827 { 1828 static if (GDC_or_LDC_with_AVX2) 1829 { 1830 return cast(__m256i) __builtin_ia32_phsubsw256(cast(short16)a, cast(short16)b); 1831 } 1832 else 1833 { 1834 __m128i a_lo = _mm256_extractf128_si256!0(a); 1835 __m128i a_hi = _mm256_extractf128_si256!1(a); 1836 __m128i b_lo = _mm256_extractf128_si256!0(b); 1837 __m128i b_hi = _mm256_extractf128_si256!1(b); 1838 __m128i r_lo = _mm_hsubs_epi16(a_lo, b_lo); 1839 __m128i r_hi = _mm_hsubs_epi16(a_hi, b_hi); 1840 return _mm256_set_m128i(r_hi, r_lo); 1841 } 1842 } 1843 unittest 1844 { 1845 __m256i A = _mm256_setr_epi16(1, -2, 4, 8, 32767, -1, -10, 32767, 1, -2, 4, 8, 32767, -1, -10, 32767); 1846 short16 C = cast(short16) _mm256_hsubs_epi16(A, A); 1847 short[16] correct = [ 3, -4, 32767, -32768, 3, -4, 32767, -32768, 3, -4, 32767, -32768, 3, -4, 32767, -32768 ]; 1848 assert(C.array == correct); 1849 } 1850 1851 /// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded 1852 /// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex` 1853 /// (each index is scaled by the factor in `scale`). Return gathered elements. 1854 /// `scale` should be 1, 2, 4 or 8. 1855 __m128i _mm_i32gather_epi32(int scale)(const(int)* base_addr, __m128i vindex) @system 1856 { 1857 __m128i src; 1858 return _mm_mask_i32gather_epi32!scale(src, base_addr, vindex, _mm_set1_epi32(-1)); 1859 } 1860 unittest 1861 { 1862 int[8] data = [0, 1, 2, 3, 1863 4, 5, 6, 7]; 1864 __m128i vindex = _mm_setr_epi32(-2, 0, 4, 6); 1865 int4 A = cast(int4) _mm_i32gather_epi32!2(&data[1], vindex); 1866 int[4] correctA = [0, 1, 3, 4]; 1867 assert(A.array == correctA); 1868 } 1869 1870 /// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded 1871 /// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex` 1872 /// (each index is scaled by the factor in `scale`). Gathered elements are merged using 1873 /// `mask` (elements are copied from `src` when the highest bit is not set in the 1874 /// corresponding element). `scale` should be 1, 2, 4 or 8. 1875 __m128i _mm_mask_i32gather_epi32(int scale)(__m128i src, const(int)* base_addr, __m128i vindex, __m128i mask) @system 1876 { 1877 static assert(isValidSIBScale(scale)); 1878 static if (LDC_with_AVX2) 1879 { 1880 return cast(__m128i) __builtin_ia32_gatherd_d(src, base_addr, vindex, mask, cast(ubyte)scale); 1881 } 1882 else static if (GDC_with_AVX2) 1883 { 1884 // Not pure, so the intrinsic cannot be pure. 1885 return cast(__m128i) __builtin_ia32_gathersiv4si (src, base_addr, vindex, mask, scale); 1886 } 1887 else 1888 { 1889 __m128i r; 1890 for (int n = 0; n < 4; ++n) 1891 { 1892 int index = vindex.array[n]; 1893 long offset = cast(long)index * scale; 1894 void* p = cast(void*)(base_addr); 1895 if (mask.array[n] < 0) 1896 r.ptr[n] = *cast(int*)(p + offset); 1897 else 1898 r.ptr[n] = src.ptr[n]; 1899 } 1900 return r; 1901 } 1902 } 1903 unittest 1904 { 1905 int[24] data = [0, 1, 2, 3, 1906 4, 5, 6, 7, 1907 8, 9, 10, 11, 1908 12, 13, 14, 15, 1909 16, 17, 18, 19, 1910 20, 21, 22, 23]; 1911 __m128i src = _mm_setr_epi32(-1, -2, -3, -4); 1912 __m128i mask = _mm_setr_epi32(-4, 4, -1, -2); 1913 __m128i vindex = _mm_setr_epi32(-4, 4, 0, 8); 1914 1915 int4 A = cast(int4) _mm_mask_i32gather_epi32!1(src, &data[10], vindex, mask); 1916 int4 B = cast(int4) _mm_mask_i32gather_epi32!2(src, &data[10], vindex, mask); 1917 int4 C = cast(int4) _mm_mask_i32gather_epi32!4(src, &data[10], vindex, mask); 1918 int[4] correctA = [9, -2, 10, 12]; 1919 int[4] correctB = [8, -2, 10, 14]; 1920 int[4] correctC = [6, -2, 10, 18]; 1921 assert(A.array == correctA); 1922 assert(B.array == correctB); 1923 assert(C.array == correctC); 1924 } 1925 1926 /// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded 1927 /// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex` 1928 /// (each index is scaled by the factor in `scale`). Gathered elements are returned. 1929 /// `scale` should be 1, 2, 4 or 8. 1930 __m256i _mm256_i32gather_epi32(int scale)(const(int)* base_addr, __m256i vindex) @system 1931 { 1932 __m256i src; 1933 return _mm256_mask_i32gather_epi32!scale(src, base_addr, vindex, _mm256_set1_epi32(-1)); 1934 } 1935 unittest 1936 { 1937 int[8] data = [0, 1, 2, 3, 1938 4, 5, 6, 7]; 1939 __m256i vindex = _mm256_setr_epi32(-1, 0, 2, 1, -2, -1, 1, 1); 1940 int8 A = cast(int8) _mm256_i32gather_epi32!4(&data[3], vindex); 1941 int[8] correctA = [2, 3, 5, 4, 1, 2, 4, 4]; 1942 assert(A.array == correctA); 1943 } 1944 1945 /// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded 1946 /// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex` 1947 /// (each index is scaled by the factor in `scale`). Gathered elements are merged using mask 1948 /// (elements are copied from `src` when the highest bit is not set in the corresponding element). 1949 /// `scale` should be 1, 2, 4 or 8. 1950 __m256i _mm256_mask_i32gather_epi32(int scale)(__m256i src, const(int)* base_addr, __m256i vindex, __m256i mask) @system 1951 { 1952 static assert(isValidSIBScale(scale)); 1953 static if (LDC_with_AVX2) 1954 { 1955 // Not pure, so the intrinsic cannot be pure. 1956 return cast(__m256i) __builtin_ia32_gatherd_d256(cast(int8)src, base_addr, cast(int8)vindex, cast(int8)mask, cast(ubyte)scale); 1957 } 1958 else static if (GDC_with_AVX2) 1959 { 1960 return cast(__m256i) __builtin_ia32_gathersiv8si (cast(int8)src, base_addr, cast(int8)vindex, cast(int8)mask, scale); 1961 } 1962 else 1963 { 1964 int8 r; 1965 int8 vindexi = cast(int8)vindex; 1966 int8 srci = cast(int8)src; 1967 int8 maski = cast(int8)mask; 1968 for (int n = 0; n < 8; ++n) 1969 { 1970 int index = vindexi.array[n]; 1971 long offset = cast(long)index * scale; 1972 void* p = cast(void*)(base_addr); 1973 if (maski.array[n] < 0) 1974 r.ptr[n] = *cast(int*)(p + offset); 1975 else 1976 r.ptr[n] = srci.ptr[n]; 1977 } 1978 return cast(__m256i)r; 1979 } 1980 } 1981 unittest 1982 { 1983 int[24] data = [0, 1, 2, 3, 1984 4, 5, 6, 7, 1985 8, 9, 10, 11, 1986 12, 13, 14, 15, 1987 16, 17, 18, 19, 1988 20, 21, 22, 23]; 1989 __m256i src = _mm256_setr_epi32(-1, -2, -3, -4, -5, -6, -7, -8); 1990 __m256i mask = _mm256_setr_epi32(-4, 4, -1, -2, 0, 0, -8, -9); 1991 __m256i vindex = _mm256_setr_epi32(-4, 4, 0, 8, 0, 12, -8, 4); 1992 1993 int8 A = cast(int8) _mm256_mask_i32gather_epi32!1(src, &data[10], vindex, mask); 1994 int8 B = cast(int8) _mm256_mask_i32gather_epi32!2(src, &data[10], vindex, mask); 1995 int8 C = cast(int8) _mm256_mask_i32gather_epi32!4(src, &data[10], vindex, mask); 1996 int[8] correctA = [9, -2, 10, 12, -5, -6, 8, 11]; 1997 int[8] correctB = [8, -2, 10, 14, -5, -6, 6, 12]; 1998 int[8] correctC = [6, -2, 10, 18, -5, -6, 2, 14]; 1999 assert(A.array == correctA); 2000 assert(B.array == correctB); 2001 assert(C.array == correctC); 2002 } 2003 2004 /// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded 2005 /// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex` 2006 /// (each index is scaled by the factor in `scale`). Gathered elements are returned. 2007 /// `scale` should be 1, 2, 4 or 8. 2008 __m128i _mm_i32gather_epi64(int scale)(const(long)* base_addr, __m128i vindex) @system 2009 { 2010 __m128i src; 2011 return _mm_mask_i32gather_epi64!scale(src, base_addr, vindex, _mm_set1_epi64x(-1)); 2012 } 2013 unittest 2014 { 2015 long[8] data = [0, 1, 2, 3, 2016 4, 5, 6, 7]; 2017 __m128i vindex = _mm_setr_epi32(-4, 24, 420, 420); 2018 long2 A = cast(long2) _mm_i32gather_epi64!2(&data[1], vindex); 2019 long[2] correctA = [0, 7]; 2020 assert(A.array == correctA); 2021 } 2022 2023 /// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded 2024 /// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex` 2025 /// (each index is scaled by the factor in `scale`). Gathered elements are merged using mask 2026 /// (elements are copied from `src` when the highest bit is not set in the corresponding element). 2027 /// `scale` should be 1, 2, 4 or 8. 2028 __m128i _mm_mask_i32gather_epi64(int scale)(__m128i src, const(long)* base_addr, __m128i vindex, __m128i mask) @system 2029 { 2030 static assert(isValidSIBScale(scale)); 2031 static if (GDC_with_AVX2) 2032 { 2033 return cast(__m128i) __builtin_ia32_gathersiv2di(cast(long2)src, base_addr, cast(int4)vindex, cast(long2)mask, scale); 2034 } 2035 else static if (LDC_with_AVX2) 2036 { 2037 return cast(__m128i) __builtin_ia32_gatherd_q(cast(long2)src, base_addr, cast(int4)vindex, cast(long2)mask, scale); 2038 } 2039 else 2040 { 2041 // Note: top 2 indexes in vindex are unused 2042 long2 r; 2043 int4 vindexi = cast(int4)vindex; 2044 long2 srci = cast(long2)src; 2045 long2 maski = cast(long2)mask; 2046 for (int n = 0; n < 2; ++n) 2047 { 2048 int index = vindexi.array[n]; 2049 long offset = cast(long)index * scale; 2050 void* p = cast(void*)(base_addr); 2051 if (maski.array[n] < 0) 2052 r.ptr[n] = *cast(long*)(p + offset); 2053 else 2054 r.ptr[n] = srci.ptr[n]; 2055 } 2056 return cast(__m128i)r; 2057 } 2058 } 2059 unittest 2060 { 2061 long[8] data = [0, 1, 2, 3, 2062 4, 5, 6, 7]; 2063 __m128i src = _mm_setr_epi64(-1, -2); 2064 __m128i mask = _mm_setr_epi64(0, -1); 2065 __m128i vindex = _mm_setr_epi32(-400, 3*8, 420, 420); 2066 long2 A = cast(long2) _mm_mask_i32gather_epi64!2(src, &data[1], vindex, mask); 2067 long[2] correctA = [-1, 7]; 2068 assert(A.array == correctA); 2069 } 2070 2071 /// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded 2072 /// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex` 2073 /// (each index is scaled by the factor in `scale`). Gathered elements are returned. 2074 /// `scale` should be 1, 2, 4 or 8. 2075 __m256i _mm256_i32gather_epi64(int scale)(const(long)* base_addr, __m128i vindex) @system 2076 { 2077 __m256i src; 2078 return _mm256_mask_i32gather_epi64!scale(src, base_addr, vindex, _mm256_set1_epi64x(-1)); 2079 } 2080 unittest 2081 { 2082 long[8] data = [0, 1, 2, 3, 2083 4, 5, 6, 7]; 2084 __m128i vindex = _mm_setr_epi32(-4, 24, 0, 12); 2085 long4 A = cast(long4) _mm256_i32gather_epi64!2(&data[1], vindex); 2086 long[4] correctA = [0, 7, 1, 4]; 2087 assert(A.array == correctA); 2088 } 2089 2090 /// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded 2091 /// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex` 2092 /// (each index is scaled by the factor in `scale`). Gathered elements are merged using mask 2093 /// (elements are copied from `src` when the highest bit is not set in the corresponding element). 2094 /// `scale` should be 1, 2, 4 or 8. 2095 __m256i _mm256_mask_i32gather_epi64(int scale)(__m256i src, const(long)* base_addr, __m128i vindex, __m256i mask) @system 2096 { 2097 static assert(isValidSIBScale(scale)); 2098 static if (LDC_with_AVX2) 2099 { 2100 return cast(__m256i) __builtin_ia32_gatherd_q256(cast(long4)src, base_addr, cast(int4)vindex, cast(long4)mask, cast(ubyte)scale); 2101 } 2102 else static if (GDC_with_AVX2) 2103 { 2104 return cast(__m256i) __builtin_ia32_gathersiv4di (cast(long4)src, base_addr, cast(int4)vindex, cast(long4)mask, scale); 2105 } 2106 else 2107 { 2108 long4 r; 2109 int4 vindexi = cast(int4)vindex; 2110 long4 srci = cast(long4)src; 2111 long4 maski = cast(long4)mask; 2112 for (int n = 0; n < 4; ++n) 2113 { 2114 int index = vindexi.array[n]; 2115 long offset = cast(long)index * scale; 2116 void* p = cast(void*)(base_addr); 2117 if (maski.array[n] < 0) 2118 r.ptr[n] = *cast(long*)(p + offset); 2119 else 2120 r.ptr[n] = srci.ptr[n]; 2121 } 2122 return cast(__m256i)r; 2123 } 2124 } 2125 unittest 2126 { 2127 long[8] data = [0, 1, 2, 3, 2128 4, 5, 6, 7]; 2129 __m256i src = _mm256_setr_epi64(-1, -2, -3, -4); 2130 __m256i mask = _mm256_setr_epi64(0, -1, 0, -1); 2131 __m128i vindex = _mm_setr_epi32(-400, 3*8, 420, 4); 2132 long4 A = cast(long4) _mm256_mask_i32gather_epi64!2(src, &data[1], vindex, mask); 2133 long[4] correctA = [-1, 7, -3, 2]; 2134 assert(A.array == correctA); 2135 } 2136 2137 // Note: the floating point gathers reuse the integer gathers 2138 2139 /// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 2140 /// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit 2141 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are returned. 2142 /// `scale` should be 1, 2, 4 or 8. 2143 __m128d _mm_i32gather_pd(int scale)(const(double)* base_addr, __m128i vindex) @system 2144 { 2145 return cast(__m128d) _mm_i32gather_epi64!scale(cast(const(long)*) base_addr, vindex); 2146 } 2147 unittest 2148 { 2149 double[8] data = [0.0, 1.0, 2.0, 3.0, 2150 4.0, 5.0, 6.0, 7.0]; 2151 __m128i vindex = _mm_setr_epi32(-4, 24, 420, 420); 2152 __m128d A = _mm_i32gather_pd!2(&data[1], vindex); 2153 double[2] correctA = [0.0, 7.0]; 2154 assert(A.array == correctA); 2155 } 2156 2157 /// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 2158 /// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit 2159 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged using `mask` 2160 /// (elements are copied from `src` when the highest bit is not set in the corresponding element). 2161 /// `scale` should be 1, 2, 4 or 8. 2162 __m128d _mm_mask_i32gather_pd(int scale)(__m128d src, const(double)* base_addr, __m128i vindex, __m128d mask) @system 2163 { 2164 return cast(__m128d) _mm_mask_i32gather_epi64!scale(cast(__m128i)src, cast(const(long)*) base_addr, vindex, cast(__m128i)mask); 2165 } 2166 unittest 2167 { 2168 double[8] data = [0.0, 1.0, 2.0, 3.0, 2169 4.0, 5.0, 6.0, 7.0]; 2170 __m128d src = _mm_setr_pd(-1.0, -2.0); 2171 __m128d mask = _mm_setr_pd(0.0, -1.0); 2172 __m128i vindex = _mm_setr_epi32(-400, 3*8, 420, 420); 2173 __m128d A = _mm_mask_i32gather_pd!2(src, &data[1], vindex, mask); 2174 double[2] correctA = [-1.0, 7.0]; 2175 assert(A.array == correctA); 2176 } 2177 2178 /// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 2179 /// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit 2180 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are returned. 2181 /// `scale` should be 1, 2, 4 or 8. 2182 __m256d _mm256_i32gather_pd(int scale)(const(double)* base_addr, __m128i vindex) @system 2183 { 2184 return cast(__m256d) _mm256_i32gather_epi64!scale(cast(const(long)*) base_addr, vindex); 2185 } 2186 unittest 2187 { 2188 double[8] data = [0.0, 1.0, 2.0, 3.0, 2189 4.0, 5.0, 6.0, 7.0]; 2190 __m128i vindex = _mm_setr_epi32(-4, 24, 0, 12); 2191 __m256d A = _mm256_i32gather_pd!2(&data[1], vindex); 2192 double[4] correctA = [0.0, 7.0, 1.0, 4.0]; 2193 assert(A.array == correctA); 2194 } 2195 2196 /// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 2197 /// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit 2198 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged using `mask` 2199 /// (elements are copied from `src` when the highest bit is not set in the corresponding element). 2200 /// `scale` should be 1, 2, 4 or 8. 2201 __m256d _mm256_mask_i32gather_pd(int scale)(__m256d src, const(double)* base_addr, __m128i vindex, __m256d mask) @system 2202 { 2203 return cast(__m256d) _mm256_mask_i32gather_epi64!scale(cast(__m256i)src, cast(const(long)*) base_addr, vindex, cast(__m256i)mask); 2204 } 2205 unittest 2206 { 2207 double[8] data = [0.0, 1.0, 2.0, 3.0, 2208 4.0, 5.0, 6.0, 7.0]; 2209 __m256d src = _mm256_setr_pd(-1.0, -2.0, -3.0, -4.0); 2210 __m256d mask = _mm256_setr_pd(0.0, -1.0, 0.0, -1.0); 2211 __m128i vindex = _mm_setr_epi32(-400, 3*8, 420, 4); 2212 __m256d A = _mm256_mask_i32gather_pd!2(src, &data[1], vindex, mask); 2213 double[4] correctA = [-1.0, 7.0, -3.0, 2.0]; 2214 assert(A.array == correctA); 2215 } 2216 2217 /// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 2218 /// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit 2219 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are returned. 2220 /// `scale` should be 1, 2, 4 or 8. 2221 __m128 _mm_i32gather_ps(int scale)(const(float)* base_addr, __m128i vindex) @system 2222 { 2223 return cast(__m128) _mm_i32gather_epi32!scale(cast(const(int)*) base_addr, vindex); 2224 } 2225 unittest 2226 { 2227 float[8] data = [0.0f, 1.0f, 2.0f, 3.0f, 2228 4.0f, 5.0f, 6.0f, 7.0f]; 2229 __m128i vindex = _mm_setr_epi32(-2, 12, 0, 4); 2230 __m128 A = _mm_i32gather_ps!2(&data[1], vindex); 2231 float[4] correctA = [0.0f, 7.0f, 1.0f, 3.0f]; 2232 assert(A.array == correctA); 2233 } 2234 2235 /// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 2236 /// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit 2237 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged using `mask` 2238 /// (elements are copied from `src` when the highest bit is not set in the corresponding element). 2239 /// `scale` should be 1, 2, 4 or 8. 2240 __m128 _mm_mask_i32gather_ps(int scale)(__m128 src, const(float)* base_addr, __m128i vindex, __m128 mask) @system 2241 { 2242 return cast(__m128) _mm_mask_i32gather_epi32!scale(cast(__m128i)src, cast(const(int)*) base_addr, vindex, cast(__m128i)mask); 2243 } 2244 unittest 2245 { 2246 float[24] data = [0.0f, 1.0f, 2.0f, 3.0f, 2247 4.0f, 5.0f, 6.0f, 7.0f, 2248 8.0f, 9.0f, 10.0f, 11.0f, 2249 12.0f, 13.0f, 14.0f, 15.0f, 2250 16.0f, 17.0f, 18.0f, 19.0f, 2251 20.0f, 21.0f, 22.0f, 23.0f]; 2252 __m128 src = _mm_setr_ps(-1.0f, -2.0f, -3.0f, -4.0f); 2253 __m128 mask = _mm_setr_ps(-4.0f, 4.0f, -1.0f, -2.0f); 2254 __m128i vindex = _mm_setr_epi32(-4, 4, 0, 8); 2255 __m128 A = _mm_mask_i32gather_ps!1(src, &data[10], vindex, mask); 2256 float[4] correctA = [9.0f, -2.0f, 10.0f, 12.0f]; 2257 assert(A.array == correctA); 2258 } 2259 2260 /// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 2261 /// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit 2262 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are returned. 2263 /// `scale` should be 1, 2, 4 or 8. 2264 __m256 _mm256_i32gather_ps(int scale)(const(float)* base_addr, __m256i vindex) @system 2265 { 2266 return cast(__m256) _mm256_i32gather_epi32!scale(cast(const(int)*) base_addr, vindex); 2267 } 2268 unittest 2269 { 2270 float[8] data = [0.0f, 1.0f, 2.0f, 3.0f, 2271 4.0f, 5.0f, 6.0f, 7.0f]; 2272 __m256i vindex = _mm256_setr_epi32(-1, 0, 2, 1, -2, -1, 1, 1); 2273 __m256 A = _mm256_i32gather_ps!4(&data[3], vindex); 2274 float[8] correctA = [2.0f, 3.0f, 5.0f, 4.0f, 1.0f, 2.0f, 4.0f, 4.0f]; 2275 assert(A.array == correctA); 2276 } 2277 2278 /// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 2279 /// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 32-bit 2280 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged using `mask` 2281 /// (elements are copied from `src` when the highest bit is not set in the corresponding element). 2282 /// `scale` should be 1, 2, 4 or 8. 2283 __m256 _mm256_mask_i32gather_ps(int scale)(__m256 src, const(float)* base_addr, __m256i vindex, __m256 mask) @system 2284 { 2285 return cast(__m256) _mm256_mask_i32gather_epi32!scale(cast(__m256i)src, cast(const(int)*) base_addr, vindex, cast(__m256i)mask); 2286 } 2287 unittest 2288 { 2289 float[24] data = [0.0f, 1.0f, 2.0f, 3.0f, 2290 4.0f, 5.0f, 6.0f, 7.0f, 2291 8.0f, 9.0f, 10.0f, 11.0f, 2292 12.0f, 13.0f, 14.0f, 15.0f, 2293 16.0f, 17.0f, 18.0f, 19.0f, 2294 20.0f, 21.0f, 22.0f, 23.0f]; 2295 __m256 src = _mm256_setr_ps(-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f); 2296 __m256 mask = _mm256_setr_ps(-4.0f, 4.0f, -1.0f, -2.0f, 0.0f, 0.0f, -8.0f, -9.0f); 2297 __m256i vindex = _mm256_setr_epi32(-4, 4, 0, 8, 0, 12, -8, 4); 2298 2299 __m256 A = _mm256_mask_i32gather_ps!2(src, &data[10], vindex, mask); 2300 float[8] correctA = [8.0f, -2.0f, 10.0f, 14.0f, -5.0f, -6.0f, 6.0f, 12.0f]; 2301 assert(A.array == correctA); 2302 } 2303 2304 /// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded 2305 /// from addresses starting at `base_addr` and offset by each 64-bit element in `vindex` 2306 /// (each index is scaled by the factor in `scale`). Return gathered elements. 2307 /// `scale` should be 1, 2, 4 or 8. 2308 __m128i _mm_i64gather_epi32(int scale)(const(int)* base_addr, __m128i vindex) @system 2309 { 2310 __m128i src; 2311 return _mm_mask_i64gather_epi32!scale(src, base_addr, vindex, _mm_set1_epi32(-1)); 2312 } 2313 unittest 2314 { 2315 int[8] data = [0, 1, 2, 3, 2316 4, 5, 6, 7]; 2317 __m128i vindex = _mm_setr_epi64(-2, 4); 2318 int4 A = cast(int4) _mm_i64gather_epi32!2(&data[1], vindex); 2319 int[4] correctA = [0, 3, 0, 0]; 2320 assert(A.array == correctA); 2321 } 2322 2323 /// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded 2324 /// from addresses starting at `base_addr` and offset by each 64-bit element in `vindex` 2325 /// (each index is scaled by the factor in `scale`). Gathered elements are merged using 2326 /// `mask` (elements are copied from `src` when the highest bit is not set in the 2327 /// corresponding element). `scale` should be 1, 2, 4 or 8. 2328 __m128i _mm_mask_i64gather_epi32(int scale)(__m128i src, const(int)* base_addr, __m128i vindex, __m128i mask) @system 2329 { 2330 static assert(isValidSIBScale(scale)); 2331 2332 static if (GDC_with_AVX2) 2333 { 2334 return cast(__m128i) __builtin_ia32_gatherdiv4si(cast(int4)src, base_addr, cast(long2)vindex, cast(int4)mask, scale); 2335 } 2336 else static if (LDC_with_AVX2) 2337 { 2338 return cast(__m128i) __builtin_ia32_gatherq_d(cast(int4)src, base_addr, cast(long2)vindex, cast(int4)mask, scale); 2339 } 2340 else 2341 { 2342 __m128i r; 2343 long2 vindexl = cast(long2)vindex; 2344 int4 srci = cast(int4)src; 2345 int4 maski = cast(int4)mask; 2346 for (int n = 0; n < 2; ++n) 2347 { 2348 long index = vindexl.array[n]; 2349 long offset = index * scale; 2350 void* p = cast(void*)(base_addr); 2351 if (maski.array[n] < 0) 2352 r.ptr[n] = *cast(int*)(p + offset); 2353 else 2354 r.ptr[n] = srci.array[n]; 2355 } 2356 r.ptr[2] = 0; 2357 r.ptr[3] = 0; 2358 return r; 2359 } 2360 } 2361 unittest 2362 { 2363 int[24] data = [0, 1, 2, 3, 2364 4, 5, 6, 7, 2365 8, 9, 10, 11, 2366 12, 13, 14, 15, 2367 16, 17, 18, 19, 2368 20, 21, 22, 23]; 2369 __m128i src = _mm_setr_epi32(-1, -2, -3, -4); 2370 __m128i mask = _mm_setr_epi32(-4, 4, -1, -2); 2371 __m128i vindex = _mm_setr_epi64(-4, 8); 2372 int4 C = cast(int4) _mm_mask_i64gather_epi32!4(src, &data[10], vindex, mask); 2373 int[4] correctC = [6, -2, 0, 0]; 2374 assert(C.array == correctC); 2375 } 2376 2377 /// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded 2378 /// from addresses starting at `base_addr` and offset by each 64-bit element in `vindex` 2379 /// (each index is scaled by the factor in `scale`). Return gathered elements. 2380 /// `scale` should be 1, 2, 4 or 8. 2381 __m128i _mm256_i64gather_epi32(int scale)(const(int)* base_addr, __m256i vindex) @system 2382 { 2383 __m128i src; 2384 return _mm256_mask_i64gather_epi32!scale(src, base_addr, vindex, _mm_set1_epi32(-1)); 2385 } 2386 unittest 2387 { 2388 int[8] data = [0, 1, 2, 3, 2389 4, 5, 6, 7]; 2390 __m256i vindex = _mm256_setr_epi64(-2, 4, 0, 2); 2391 int4 A = cast(int4) _mm256_i64gather_epi32!2(&data[1], vindex); 2392 int[4] correctA = [0, 3, 1, 2]; 2393 assert(A.array == correctA); 2394 } 2395 2396 /// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded 2397 /// from addresses starting at `base_addr` and offset by each 64-bit element in `vindex` 2398 /// (each index is scaled by the factor in `scale`). Gathered elements are merged using 2399 /// `mask` (elements are copied from `src` when the highest bit is not set in the 2400 /// corresponding element). `scale` should be 1, 2, 4 or 8. 2401 __m128i _mm256_mask_i64gather_epi32(int scale)(__m128i src, const(int)* base_addr, __m256i vindex, __m128i mask) @system 2402 { 2403 static assert(isValidSIBScale(scale)); 2404 2405 static if (GDC_with_AVX2) 2406 { 2407 return cast(__m128i) __builtin_ia32_gatherdiv4si256(cast(int4)src, base_addr, cast(long4)vindex, cast(int4)mask, scale); 2408 } 2409 else static if (LDC_with_AVX2) 2410 { 2411 return cast(__m128i) __builtin_ia32_gatherq_d256(cast(int4)src, base_addr, cast(long4)vindex, cast(int4)mask, cast(ubyte)scale); 2412 } 2413 else 2414 { 2415 __m128i r = src; 2416 long4 vindexl = cast(long4)vindex; 2417 int4 srci = cast(int4)src; 2418 int4 maski = cast(int4)mask; 2419 for (int n = 0; n < 4; ++n) 2420 { 2421 long index = vindexl.array[n]; 2422 long offset = index * scale; 2423 void* p = cast(void*)(base_addr); 2424 if (maski.array[n] < 0) 2425 r.ptr[n] = *cast(int*)(p + offset); 2426 else 2427 r.ptr[n] = srci.ptr[n]; 2428 } 2429 return r; 2430 } 2431 } 2432 unittest 2433 { 2434 int[24] data = [0, 1, 2, 3, 2435 4, 5, 6, 7, 2436 8, 9, 10, 11, 2437 12, 13, 14, 15, 2438 16, 17, 18, 19, 2439 20, 21, 22, 23]; 2440 __m128i src = _mm_setr_epi32(-1, -2, -3, -4); 2441 __m128i mask = _mm_setr_epi32(-4, 4, -1, -2); 2442 __m256i vindex = _mm256_setr_epi64(-4, 8, 0, 12); 2443 2444 int4 A = cast(int4) _mm256_mask_i64gather_epi32!1(src, &data[10], vindex, mask); 2445 int4 B = cast(int4) _mm256_mask_i64gather_epi32!2(src, &data[10], vindex, mask); 2446 int4 C = cast(int4) _mm256_mask_i64gather_epi32!4(src, &data[10], vindex, mask); 2447 int[4] correctA = [9, -2, 10, 13]; 2448 int[4] correctB = [8, -2, 10, 16]; 2449 int[4] correctC = [6, -2, 10, 22]; 2450 assert(A.array == correctA); 2451 assert(B.array == correctB); 2452 assert(C.array == correctC); 2453 } 2454 2455 /// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded 2456 /// from addresses starting at `base_addr` and offset by each 64-bit element in `vindex` 2457 /// (each index is scaled by the factor in `scale`). Gathered elements are returned. 2458 /// `scale` should be 1, 2, 4 or 8. 2459 __m128i _mm_i64gather_epi64(int scale)(const(long)* base_addr, __m128i vindex) @system 2460 { 2461 __m128i src; 2462 return _mm_mask_i64gather_epi64!scale(src, base_addr, vindex, _mm_set1_epi64x(-1)); 2463 } 2464 unittest 2465 { 2466 long[8] data = [0, 1, 2, 3, 2467 4, 5, 6, 7]; 2468 __m128i vindex = _mm_setr_epi64(-4, 24); 2469 long2 A = cast(long2) _mm_i64gather_epi64!2(&data[1], vindex); 2470 long[2] correctA = [0, 7]; 2471 assert(A.array == correctA); 2472 } 2473 2474 /// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded 2475 /// from addresses starting at `base_addr` and offset by each 32-bit element in `vindex` 2476 /// (each index is scaled by the factor in `scale`). Gathered elements are merged using mask 2477 /// (elements are copied from `src` when the highest bit is not set in the corresponding element). 2478 /// `scale` should be 1, 2, 4 or 8. 2479 __m128i _mm_mask_i64gather_epi64(int scale)(__m128i src, const(long)* base_addr, __m128i vindex, __m128i mask) @system 2480 { 2481 static assert(isValidSIBScale(scale)); 2482 2483 static if (GDC_with_AVX2) 2484 { 2485 return cast(__m128i) __builtin_ia32_gatherdiv2di(cast(long2)src, base_addr, cast(long2)vindex, cast(long2)mask, scale); 2486 } 2487 else static if (LDC_with_AVX2) 2488 { 2489 return cast(__m128i) __builtin_ia32_gatherq_q(cast(long2)src, base_addr, cast(long2)vindex, cast(long2)mask, scale); 2490 } 2491 else 2492 { 2493 // Note: top 2 indexes in vindex are unused 2494 long2 r; 2495 long2 vindexi = cast(long2)vindex; 2496 long2 srci = cast(long2)src; 2497 long2 maski = cast(long2)mask; 2498 for (int n = 0; n < 2; ++n) 2499 { 2500 long index = vindexi.array[n]; 2501 long offset = index * scale; 2502 void* p = cast(void*)(base_addr); 2503 if (maski.array[n] < 0) 2504 r.ptr[n] = *cast(long*)(p + offset); 2505 else 2506 r.ptr[n] = srci.array[n]; 2507 } 2508 return cast(__m128i)r; 2509 } 2510 } 2511 unittest 2512 { 2513 long[8] data = [0, 1, 2, 3, 2514 4, 5, 6, 7]; 2515 __m128i src = _mm_setr_epi64(-1, -2); 2516 __m128i mask = _mm_setr_epi64(0, -1); 2517 __m128i vindex = _mm_setr_epi64(-400, 3*8); 2518 long2 A = cast(long2) _mm_mask_i64gather_epi64!2(src, &data[1], vindex, mask); 2519 long2 B = cast(long2) _mm_mask_i64gather_epi64!1(src, &data[1], vindex, mask); 2520 long[2] correctA = [-1, 7]; 2521 long[2] correctB = [-1, 4]; 2522 assert(A.array == correctA); 2523 assert(B.array == correctB); 2524 } 2525 2526 /// Gather 64-bit integers from memory using 64-bit indices. 2527 /// 64-bit elements are loaded from addresses starting at `base_addr` and 2528 /// offset by each 64-bit element in `vindex` (each index is scaled by the 2529 /// factor in `scale`). Gathered elements are returned. 2530 /// `scale` should be 1, 2, 4 or 8. 2531 __m256i _mm256_i64gather_epi64(int scale)(const(long)* base_addr, __m256i vindex) @system 2532 { 2533 __m256i src; 2534 return _mm256_mask_i64gather_epi64!scale(src, base_addr, vindex, _mm256_set1_epi64x(-1)); 2535 } 2536 unittest 2537 { 2538 long[8] data = [0, 1, 2, 3, 2539 4, 5, 6, 7]; 2540 __m256i vindex = _mm256_setr_epi64(-4, 24, 12, 4); 2541 long4 A = cast(long4) _mm256_i64gather_epi64!2(&data[1], vindex); 2542 long[4] correctA = [0, 7, 4, 2]; 2543 assert(A.array == correctA); 2544 } 2545 2546 /// Gather 64-bit integers from memory using 64-bit indices. 2547 /// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 2548 /// 64-bit element in `vindex` (each index is scaled by the factor in `scale`). 2549 /// Gathered elements are merged into `dst` using mask (elements are copied from `src` 2550 /// when the highest bit is not set in the corresponding element). 2551 /// `scale` should be 1, 2, 4 or 8. 2552 __m256i _mm256_mask_i64gather_epi64(int scale)(__m256i src, const(long)* base_addr, __m256i vindex, __m256i mask) @system 2553 { 2554 static assert(isValidSIBScale(scale)); 2555 static if (LDC_with_AVX2) 2556 { 2557 return cast(__m256i) __builtin_ia32_gatherq_q256(cast(long4)src, base_addr, cast(long4)vindex, cast(long4)mask, scale); 2558 } 2559 else static if (GDC_with_AVX2) 2560 { 2561 return cast(__m256i) __builtin_ia32_gatherdiv4di (cast(long4)src, base_addr, cast(long4)vindex, cast(long4)mask, scale); 2562 } 2563 else 2564 { 2565 long4 r; 2566 long4 vindexi = cast(long4)vindex; 2567 long4 srci = cast(long4)src; 2568 long4 maski = cast(long4)mask; 2569 for (int n = 0; n < 4; ++n) 2570 { 2571 long index = vindexi.array[n]; 2572 long offset = index * scale; 2573 void* p = cast(void*)(base_addr); 2574 if (maski.array[n] < 0) 2575 r.ptr[n] = *cast(long*)(p + offset); 2576 else 2577 r.ptr[n] = srci.array[n]; 2578 } 2579 return cast(__m256i)r; 2580 } 2581 } 2582 unittest 2583 { 2584 long[8] data = [0, 1, 2, 3, 2585 4, 5, 6, 7]; 2586 __m256i src = _mm256_setr_epi64(-1, -2, -3, -4); 2587 __m256i mask = _mm256_setr_epi64(0, -1, 0, -1); 2588 __m256i vindex = _mm256_setr_epi64(-400, 3*8, 420, 4); 2589 long4 A = cast(long4) _mm256_mask_i64gather_epi64!2(src, &data[1], vindex, mask); 2590 long[4] correctA = [-1, 7, -3, 2]; 2591 assert(A.array == correctA); 2592 } 2593 2594 /// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 2595 /// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 2596 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are 2597 /// returned. `scale` should be 1, 2, 4 or 8. 2598 __m128d _mm_i64gather_pd(int scale)(const(double)* base_addr, __m128i vindex) @system 2599 { 2600 return cast(__m128d) _mm_i64gather_epi64!scale(cast(const(long)*)base_addr, vindex); 2601 } 2602 unittest 2603 { 2604 double[8] data = [0.0, 1.0, 2.0, 3.0, 2605 4.0, 5.0, 6.0, 7.0]; 2606 __m128i vindex = _mm_setr_epi64(-4, 24); 2607 __m128d A = _mm_i64gather_pd!2(&data[1], vindex); 2608 double[2] correctA = [0.0, 7.0]; 2609 assert(A.array == correctA); 2610 } 2611 2612 /// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 2613 /// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 2614 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged 2615 /// into `dst` using `mask` (elements are copied from `src` when the highest bit is not set in the 2616 /// corresponding element). `scale` should be 1, 2, 4 or 8. 2617 __m128d _mm_mask_i64gather_pd(int scale)(__m128d src, const(double)* base_addr, __m128i vindex, __m128d mask) @system 2618 { 2619 return cast(__m128d) _mm_mask_i64gather_epi64!scale(cast(__m128i)src, cast(const(long)*)base_addr, vindex, cast(__m128i) mask); 2620 } 2621 unittest 2622 { 2623 double[8] data = [0.0, 1.0, 2.0, 3.0, 2624 4.0, 5.0, 6.0, 7.0]; 2625 __m128d src = _mm_setr_pd(-1.0, -2.0); 2626 __m128d mask = _mm_setr_pd(0.0, -1.0); 2627 __m128i vindex = _mm_setr_epi64(-400, 3*8); 2628 __m128d A = _mm_mask_i64gather_pd!2(src, &data[1], vindex, mask); 2629 double[2] correctA = [-1.0, 7.0]; 2630 assert(A.array == correctA); 2631 } 2632 2633 /// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 2634 /// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 2635 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are returned. 2636 /// `scale` should be 1, 2, 4 or 8. 2637 __m256d _mm256_i64gather_pd(int scale)(const(double)* base_addr, __m256i vindex) @system 2638 { 2639 return cast(__m256d) _mm256_i64gather_epi64!scale(cast(const(long)*)base_addr, vindex); 2640 } 2641 unittest 2642 { 2643 double[8] data = [0.0, 1.0, 2.0, 3.0, 2644 4.0, 5.0, 6.0, 7.0]; 2645 __m256i vindex = _mm256_setr_epi64(-4, 24, 0, 12); 2646 __m256d A = _mm256_i64gather_pd!2(&data[1], vindex); 2647 double[4] correctA = [0.0, 7.0, 1.0, 4.0]; 2648 assert(A.array == correctA); 2649 } 2650 2651 /// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 2652 /// 64-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 2653 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged using `mask` 2654 /// (elements are copied from `src` when the highest bit is not set in the corresponding element). 2655 /// `scale` should be 1, 2, 4 or 8. 2656 __m256d _mm256_mask_i64gather_pd(int scale)(__m256d src, const(double)* base_addr, __m256i vindex, __m256d mask) @system 2657 { 2658 return cast(__m256d) _mm256_mask_i64gather_epi64!scale(cast(__m256i)src, cast(const(long)*)base_addr, vindex, cast(__m256i) mask); 2659 } 2660 unittest 2661 { 2662 double[8] data = [0.0, 1.0, 2.0, 3.0, 2663 4.0, 5.0, 6.0, 7.0]; 2664 __m256d src = _mm256_setr_pd(-1.0, -2.0, -3.0, -4.0); 2665 __m256d mask = _mm256_setr_pd(0.0, -1.0, 0.0, -1.0); 2666 __m256i vindex = _mm256_setr_epi64(-400, 3*8, 420, 4); 2667 __m256d A = _mm256_mask_i64gather_pd!2(src, &data[1], vindex, mask); 2668 double[4] correctA = [-1.0, 7.0, -3.0, 2.0]; 2669 assert(A.array == correctA); 2670 } 2671 2672 /// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 2673 /// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 2674 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are returned. 2675 /// `scale` should be 1, 2, 4 or 8. 2676 __m128 _mm_i64gather_ps(int scale)(const(float)* base_addr, __m128i vindex) @system 2677 { 2678 return cast(__m128) _mm_i64gather_epi32!scale(cast(const(int)*)base_addr, vindex); 2679 } 2680 unittest 2681 { 2682 float[8] data = [0.0f, 1.0f, 2.0f, 3.0f, 2683 4.0f, 5.0f, 6.0f, 7.0f]; 2684 __m128i vindex = _mm_setr_epi64(-2, 12); 2685 __m128 A = _mm_i64gather_ps!2(&data[1], vindex); 2686 float[4] correctA = [0.0f, 7.0f, 0.0f, 0.0f]; 2687 assert(A.array == correctA); 2688 } 2689 2690 /// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 2691 /// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 2692 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged using `mask` 2693 /// (elements are copied from `src` when the highest bit is not set in the corresponding element). 2694 /// `scale` should be 1, 2, 4 or 8. 2695 __m128 _mm_mask_i64gather_ps(int scale)(__m128 src, const(float)* base_addr, __m128i vindex, __m128 mask) @system 2696 { 2697 return cast(__m128) _mm_mask_i64gather_epi32!scale(cast(__m128i) src, cast(const(int)*) base_addr, vindex, cast(__m128i) mask); 2698 } 2699 unittest 2700 { 2701 float[24] data = [0.0f, 1.0f, 2.0f, 3.0f, 2702 4.0f, 5.0f, 6.0f, 7.0f, 2703 8.0f, 9.0f, 10.0f, 11.0f, 2704 12.0f, 13.0f, 14.0f, 15.0f, 2705 16.0f, 17.0f, 18.0f, 19.0f, 2706 20.0f, 21.0f, 22.0f, 23.0f]; 2707 __m128 src = _mm_setr_ps(-1.0f, -2.0f, -3.0f, -4.0f); 2708 __m128 mask = _mm_setr_ps(-4.0f, 4.0f, -1.0f, -2.0f); 2709 __m128i vindex = _mm_setr_epi64(-4, 4); 2710 __m128 A = _mm_mask_i64gather_ps!1(src, &data[10], vindex, mask); 2711 float[4] correctA = [9.0f, -2.0f, 0.0f, 0.0f]; 2712 assert(A.array == correctA); 2713 } 2714 2715 /// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 2716 /// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 2717 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are returned. 2718 /// `scale` should be 1, 2, 4 or 8. 2719 __m128 _mm256_i64gather_ps(int scale)(const(float)* base_addr, __m256i vindex) @system 2720 { 2721 return cast(__m128) _mm256_i64gather_epi32!scale(cast(const(int)*)base_addr, vindex); 2722 } 2723 unittest 2724 { 2725 float[8] data = [0.0f, 1.0f, 2.0f, 3.0f, 2726 4.0f, 5.0f, 6.0f, 7.0f]; 2727 __m256i vindex = _mm256_setr_epi64(-1, 0, 2, 1); 2728 __m128 A = _mm256_i64gather_ps!4(&data[3], vindex); 2729 float[4] correctA = [2.0f, 3.0f, 5.0f, 4.0f]; 2730 assert(A.array == correctA); 2731 } 2732 2733 /// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 2734 /// 32-bit elements are loaded from addresses starting at `base_addr` and offset by each 64-bit 2735 /// element in `vindex` (each index is scaled by the factor in `scale`). Gathered elements are merged using `mask` 2736 /// (elements are copied from `src` when the highest bit is not set in the corresponding element). 2737 /// `scale` should be 1, 2, 4 or 8. 2738 __m128 _mm256_mask_i64gather_ps(int scale)(__m128 src, const(float)* base_addr, __m256i vindex, __m128 mask) @system 2739 { 2740 return cast(__m128) _mm256_mask_i64gather_epi32!scale(cast(__m128i) src, cast(const(int)*) base_addr, vindex, cast(__m128i) mask); 2741 } 2742 unittest 2743 { 2744 float[24] data = [0.0f, 1.0f, 2.0f, 3.0f, 2745 4.0f, 5.0f, 6.0f, 7.0f, 2746 8.0f, 9.0f, 10.0f, 11.0f, 2747 12.0f, 13.0f, 14.0f, 15.0f, 2748 16.0f, 17.0f, 18.0f, 19.0f, 2749 20.0f, 21.0f, 22.0f, 23.0f]; 2750 __m128 src = _mm_setr_ps(-1.0f, -2.0f, -3.0f, -4.0f); 2751 __m128 mask = _mm_setr_ps(-4.0f, 4.0f, -1.0f, -2.0f); 2752 __m256i vindex = _mm256_setr_epi64(-4, 4, 0, 8); 2753 __m128 A = _mm256_mask_i64gather_ps!2(src, &data[10], vindex, mask); 2754 float[4] correctA = [8.0f, -2.0f, 10.0f, 14.0f]; 2755 assert(A.array == correctA); 2756 } 2757 2758 /// Copy `a` to result, then insert 128 bits from `b` into result at the location specified by 2759 /// `imm8`. 2760 __m256i _mm256_inserti128_si256 (__m256i a, __m128i b, const int imm8) pure @trusted 2761 { 2762 long2 lb = cast(long2)b; 2763 a.ptr[(imm8 & 1)*2 ] = lb.array[0]; 2764 a.ptr[(imm8 & 1)*2+1] = lb.array[1]; 2765 return a; 2766 } 2767 unittest 2768 { 2769 __m256i A = [0, 1, 2, 3]; 2770 long2 B = [4, 5]; 2771 __m256i C = _mm256_inserti128_si256(A, cast(__m128i)B, 0 + 8); 2772 __m256i D = _mm256_inserti128_si256(A, cast(__m128i)B, 1); 2773 long[4] correctC = [4, 5, 2, 3]; 2774 long[4] correctD = [0, 1, 4, 5]; 2775 assert(C.array == correctC); 2776 assert(D.array == correctD); 2777 } 2778 2779 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 2780 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 2781 /// and pack the results in destination. 2782 __m256i _mm256_madd_epi16 (__m256i a, __m256i b) pure @trusted 2783 { 2784 static if (GDC_with_AVX2) 2785 { 2786 return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b); 2787 } 2788 else static if (LDC_with_AVX2) 2789 { 2790 return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b); 2791 } 2792 else 2793 { 2794 // split is beneficial for ARM64, LDC and GDC without AVX2 2795 __m128i a_lo = _mm256_extractf128_si256!0(a); 2796 __m128i a_hi = _mm256_extractf128_si256!1(a); 2797 __m128i b_lo = _mm256_extractf128_si256!0(b); 2798 __m128i b_hi = _mm256_extractf128_si256!1(b); 2799 __m128i r_lo = _mm_madd_epi16(a_lo, b_lo); 2800 __m128i r_hi = _mm_madd_epi16(a_hi, b_hi); 2801 return _mm256_set_m128i(r_hi, r_lo); 2802 } 2803 } 2804 unittest 2805 { 2806 short16 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2807 short16 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2808 int8 R = cast(int8) _mm256_madd_epi16(cast(__m256i)A, cast(__m256i)B); 2809 int[8] correct = [1, 13, -2147483648, 2*32767*32767, 1, 13, -2147483648, 2*32767*32767]; 2810 assert(R.array == correct); 2811 } 2812 2813 /// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding 2814 /// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers. 2815 /// Horizontally add adjacent pairs of intermediate signed 16-bit integers, 2816 /// and pack the saturated results. 2817 __m256i _mm256_maddubs_epi16 (__m256i a, __m256i b) @safe 2818 { 2819 static if (GDC_with_AVX2) 2820 { 2821 return cast(__m256i)__builtin_ia32_pmaddubsw256(cast(ubyte32)a, cast(ubyte32)b); 2822 } 2823 else static if (LDC_with_AVX2) 2824 { 2825 return cast(__m256i)__builtin_ia32_pmaddubsw256(cast(byte32)a, cast(byte32)b); 2826 } 2827 else 2828 { 2829 __m128i a_lo = _mm256_extractf128_si256!0(a); 2830 __m128i a_hi = _mm256_extractf128_si256!1(a); 2831 __m128i b_lo = _mm256_extractf128_si256!0(b); 2832 __m128i b_hi = _mm256_extractf128_si256!1(b); 2833 __m128i r_lo = _mm_maddubs_epi16(a_lo, b_lo); 2834 __m128i r_hi = _mm_maddubs_epi16(a_hi, b_hi); 2835 return _mm256_set_m128i(r_hi, r_lo); 2836 } 2837 } 2838 unittest 2839 { 2840 __m128i A = _mm_setr_epi8( -1, 10, 100, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // u8 2841 __m128i B = _mm_setr_epi8(-128, -30, 100, 127, -1, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0); // i8 2842 __m256i AA = _mm256_set_m128i(A, A); 2843 __m256i BB = _mm256_set_m128i(B, B); 2844 short16 C = cast(short16) _mm256_maddubs_epi16(AA, BB); 2845 short[16] correct = [ -32768, 26256, 0, 0, 0, 0, 0, 0, 2846 -32768, 26256, 0, 0, 0, 0, 0, 0]; 2847 assert(C.array == correct); 2848 } 2849 2850 version(DigitalMars) 2851 { 2852 // this avoids a bug with DMD < 2.099 -a x86 -O 2853 private enum bool maskLoadWorkaroundDMD = (__VERSION__ < 2099); 2854 } 2855 else 2856 { 2857 private enum bool maskLoadWorkaroundDMD = false; 2858 } 2859 2860 /// Load packed 32-bit integers from memory using `mask` (elements are zeroed out when the highest 2861 /// bit is not set in the corresponding element). 2862 /// Warning: See "Note about mask load/store" to know why you must address valid memory only. 2863 __m128i _mm_maskload_epi32 (const(int)* mem_addr, __m128i mask) /* pure */ @system 2864 { 2865 // PERF DMD 2866 static if (LDC_with_AVX2) 2867 { 2868 // MAYDO report that the builtin is impure 2869 return __builtin_ia32_maskloadd(mem_addr, mask); 2870 } 2871 else static if (GDC_with_AVX2) 2872 { 2873 return __builtin_ia32_maskloadd(cast(__m128i*)mem_addr, mask); 2874 } 2875 else 2876 { 2877 return cast(__m128i) _mm_maskload_ps(cast(const(float)*)mem_addr, mask); 2878 } 2879 } 2880 unittest 2881 { 2882 static if (!maskLoadWorkaroundDMD) 2883 { 2884 int[4] A = [7, 1, 2, 3]; 2885 int4 B = _mm_maskload_epi32(A.ptr, _mm_setr_epi32(1, -1, -1, 1)); // can NOT address invalid memory with mask load and writes! 2886 int[4] correct = [0, 1, 2, 0]; 2887 assert(B.array == correct); 2888 } 2889 } 2890 2891 /// Load packed 32-bit integers from memory using `mask` (elements are zeroed out when the highest 2892 /// bit is not set in the corresponding element). 2893 /// Warning: See "Note about mask load/store" to know why you must address valid memory only. 2894 __m256i _mm256_maskload_epi32 (const(int)* mem_addr, __m256i mask) /* pure */ @system 2895 { 2896 static if (LDC_with_AVX2) 2897 { 2898 return cast(__m256i) __builtin_ia32_maskloadd256(mem_addr, cast(int8)mask); 2899 } 2900 else static if (GDC_with_AVX2) 2901 { 2902 return cast(__m256i)__builtin_ia32_maskloadd256(cast(__m256i*)mem_addr, cast(int8)mask); 2903 } 2904 else 2905 { 2906 return cast(__m256i) _mm256_maskload_ps(cast(const(float*)) mem_addr, mask); 2907 } 2908 } 2909 unittest 2910 { 2911 int[8] A = [7, 1, 2, 3, 8, -2, 4, 5]; 2912 int8 B = cast(int8) _mm256_maskload_epi32(A.ptr, _mm256_setr_epi32(1, -1, -1, 1, -1, -1, 1, 1)); 2913 int[8] correct = [0, 1, 2, 0, 8, -2, 0, 0]; 2914 assert(B.array == correct); 2915 } 2916 2917 /// Load packed 64-bit integers from memory using `mask` (elements are zeroed out when the highest 2918 /// bit is not set in the corresponding element). 2919 /// Warning: See "Note about mask load/store" to know why you must address valid memory only. 2920 __m128i _mm_maskload_epi64 (const(long)* mem_addr, __m128i mask) @system 2921 { 2922 // PERF DMD 2923 static if (LDC_with_AVX2) 2924 { 2925 return cast(__m128i) __builtin_ia32_maskloadq(mem_addr, cast(long2) mask); 2926 } 2927 else static if (GDC_with_AVX2) 2928 { 2929 return cast(__m128i) __builtin_ia32_maskloadq(cast(long2*)mem_addr, cast(long2) mask); 2930 } 2931 else 2932 { 2933 return cast(__m128i) _mm_maskload_pd(cast(const(double)*)mem_addr, mask); 2934 } 2935 } 2936 unittest 2937 { 2938 static if (!maskLoadWorkaroundDMD) 2939 { 2940 long[2] A = [-7, -8]; 2941 long2 B = cast(long2) _mm_maskload_epi64(A.ptr, _mm_setr_epi64(1, -1)); 2942 long[2] correct = [0, -8]; 2943 assert(B.array == correct); 2944 } 2945 } 2946 2947 /// Load packed 64-bit integers from memory using `mask` (elements are zeroed out when the highest 2948 /// bit is not set in the corresponding element). 2949 /// Warning: See "Note about mask load/store" to know why you must address valid memory only. 2950 __m256i _mm256_maskload_epi64 (const(long)* mem_addr, __m256i mask) /* pure */ @system 2951 { 2952 static if (LDC_with_AVX2) 2953 { 2954 return cast(__m256i) __builtin_ia32_maskloadq256(mem_addr, cast(long4)mask); 2955 } 2956 else static if (GDC_with_AVX2) 2957 { 2958 return cast(__m256i)__builtin_ia32_maskloadq256(cast(__m256i*)mem_addr, cast(long4)mask); 2959 } 2960 else 2961 { 2962 return cast(__m256i) _mm256_maskload_pd(cast(const(double*)) mem_addr, mask); 2963 } 2964 } 2965 unittest 2966 { 2967 long[4] A = [ 8, -2, 4, 5]; 2968 long4 B = cast(long4) _mm256_maskload_epi64(A.ptr, _mm256_setr_epi64(1, -1, -1, 1)); 2969 long[4] correct = [0, -2, 4, 0]; 2970 } 2971 2972 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values. 2973 __m256i _mm256_max_epi16 (__m256i a, __m256i b) pure @safe 2974 { 2975 // PERF D_SIMD 2976 version(GNU) 2977 enum bool split = true; 2978 else static if (SIMD_COMPARISON_MASKS_32B) 2979 enum bool split = false; 2980 else 2981 enum bool split = true; 2982 2983 static if (GDC_with_AVX2) 2984 { 2985 return cast(__m256i) __builtin_ia32_pmaxsw256(cast(short16)a, cast(short16)b); 2986 } 2987 else static if (split) 2988 { 2989 // split 2990 __m128i a_lo = _mm256_extractf128_si256!0(a); 2991 __m128i a_hi = _mm256_extractf128_si256!1(a); 2992 __m128i b_lo = _mm256_extractf128_si256!0(b); 2993 __m128i b_hi = _mm256_extractf128_si256!1(b); 2994 __m128i r_lo = _mm_max_epi16(a_lo, b_lo); 2995 __m128i r_hi = _mm_max_epi16(a_hi, b_hi); 2996 return _mm256_set_m128i(r_hi, r_lo); 2997 } 2998 else static if (SIMD_COMPARISON_MASKS_32B) 2999 { 3000 // catastrophic with GDC x86 for some reason. Sad. 3001 short16 sa = cast(short16)a; 3002 short16 sb = cast(short16)b; 3003 short16 greater = sa > sb; 3004 return cast(__m256i)( (greater & sa) | (~greater & sb) ); 3005 } 3006 else 3007 static assert(0); 3008 } 3009 unittest 3010 { 3011 short16 R = cast(short16) _mm256_max_epi16(_mm256_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57, 1, 0, 0, 0, 1, 0, 0, 0), 3012 _mm256_setr_epi16( -4,-8, 9, 7, 0,-32768, 0, 0, 0, 2, 0, 4, 2, 1, 2, -4)); 3013 short[16] correct = [32767, 1, 9, 7, 9, 7, 0, 0, 1, 2, 0, 4, 2, 1, 2, 0]; 3014 assert(R.array == correct); 3015 } 3016 3017 /// Compare packed signed 32-bit integers in `a` and `b`, and return packed maximum values. 3018 __m256i _mm256_max_epi32 (__m256i a, __m256i b) pure @safe 3019 { 3020 // PERF D_SIMD 3021 version(GNU) 3022 enum bool split = true; 3023 else static if (SIMD_COMPARISON_MASKS_32B) 3024 enum bool split = false; 3025 else 3026 enum bool split = true; 3027 3028 static if (GDC_with_AVX2) 3029 { 3030 return cast(__m256i) __builtin_ia32_pmaxsd256(cast(int8)a, cast(int8)b); 3031 } 3032 else static if (split) 3033 { 3034 // split 3035 __m128i a_lo = _mm256_extractf128_si256!0(a); 3036 __m128i a_hi = _mm256_extractf128_si256!1(a); 3037 __m128i b_lo = _mm256_extractf128_si256!0(b); 3038 __m128i b_hi = _mm256_extractf128_si256!1(b); 3039 __m128i r_lo = _mm_max_epi32(a_lo, b_lo); 3040 __m128i r_hi = _mm_max_epi32(a_hi, b_hi); 3041 return _mm256_set_m128i(r_hi, r_lo); 3042 } 3043 else static if (SIMD_COMPARISON_MASKS_32B) 3044 { 3045 // catastrophic with GDC x86 for some reason, like for 16-bit numbers. 3046 int8 sa = cast(int8)a; 3047 int8 sb = cast(int8)b; 3048 int8 greater = sa > sb; 3049 return cast(__m256i)( (greater & sa) | (~greater & sb) ); 3050 } 3051 else 3052 static assert(0); 3053 } 3054 unittest 3055 { 3056 int8 R = cast(int8) _mm256_max_epi32(_mm256_setr_epi32(0x7fffffff, 1, -4, 7, 0x7fffffff, 2, -4, 7), 3057 _mm256_setr_epi32( -4,-8, 9, -8,-0x80000000,-8, 9, -8)); 3058 int[8] correct = [0x7fffffff, 1, 9, 7, 0x7fffffff, 2, 9, 7]; 3059 assert(R.array == correct); 3060 } 3061 3062 /// Compare packed signed 8-bit integers in `a` and `b`, and return packed maximum values. 3063 __m256i _mm256_max_epi8 (__m256i a, __m256i b) pure @trusted 3064 { 3065 // PERF D_SIMD 3066 version(GNU) 3067 enum bool split = true; 3068 else static if (SIMD_COMPARISON_MASKS_32B) 3069 enum bool split = false; 3070 else 3071 enum bool split = true; 3072 static if (GDC_with_AVX2) 3073 { 3074 // Strangely, GDC asks for unsigned ubyte32 3075 return cast(__m256i) __builtin_ia32_pmaxsb256(cast(ubyte32)a, cast(ubyte32)b); 3076 } 3077 else static if (split) 3078 { 3079 // split 3080 __m128i a_lo = _mm256_extractf128_si256!0(a); 3081 __m128i a_hi = _mm256_extractf128_si256!1(a); 3082 __m128i b_lo = _mm256_extractf128_si256!0(b); 3083 __m128i b_hi = _mm256_extractf128_si256!1(b); 3084 __m128i r_lo = _mm_max_epi8(a_lo, b_lo); 3085 __m128i r_hi = _mm_max_epi8(a_hi, b_hi); 3086 return _mm256_set_m128i(r_hi, r_lo); 3087 } 3088 else static if (SIMD_COMPARISON_MASKS_32B) 3089 { 3090 // This is real bad with GDC, again 3091 byte32 sa = cast(byte32)a; 3092 byte32 sb = cast(byte32)b; 3093 byte32 greater = cast(byte32)(sa > sb); 3094 return cast(__m256i)( (greater & sa) | (~greater & sb) ); 3095 } 3096 else 3097 static assert(false); 3098 } 3099 unittest 3100 { 3101 __m256i A = _mm256_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0, 127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); 3102 __m256i B = _mm256_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0); 3103 byte32 R = cast(byte32) _mm256_max_epi8(A, B); 3104 byte[32] correct = [127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0, 127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 4, 0, 0]; 3105 assert(R.array == correct); 3106 } 3107 3108 /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed maximum values. 3109 __m256i _mm256_max_epu16 (__m256i a, __m256i b) pure @trusted 3110 { 3111 // PERF D_SIMD 3112 version(GNU) 3113 enum bool split = true; 3114 else static if (SIMD_COMPARISON_MASKS_32B) 3115 enum bool split = false; 3116 else 3117 enum bool split = true; 3118 3119 static if (GDC_with_AVX2) 3120 { 3121 return cast(__m256i) __builtin_ia32_pmaxuw256(cast(short16)a, cast(short16)b); 3122 } 3123 else static if (split) 3124 { 3125 // split 3126 __m128i a_lo = _mm256_extractf128_si256!0(a); 3127 __m128i a_hi = _mm256_extractf128_si256!1(a); 3128 __m128i b_lo = _mm256_extractf128_si256!0(b); 3129 __m128i b_hi = _mm256_extractf128_si256!1(b); 3130 __m128i r_lo = _mm_max_epu16(a_lo, b_lo); 3131 __m128i r_hi = _mm_max_epu16(a_hi, b_hi); 3132 return _mm256_set_m128i(r_hi, r_lo); 3133 } 3134 else static if (SIMD_COMPARISON_MASKS_32B) 3135 { 3136 // catastrophic with GDC x86_64, good with LDC 3137 short16 sa = cast(short16)a; 3138 short16 sb = cast(short16)b; 3139 short16 greater = cast(short16)(cast(ushort16)sa > cast(ushort16)sb); 3140 return cast(__m256i)( (greater & sa) | (~greater & sb) ); 3141 } 3142 else 3143 static assert(false); 3144 } 3145 unittest 3146 { 3147 short16 R = cast(short16) _mm256_max_epu16(_mm256_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57, 1, 0, 0, 0, 1, 0, 0, -6), 3148 _mm256_setr_epi16( -4,-8, 9, 7, 0,-32768, 0, 0, 0, 2, 0, 4, 2, 1, 2, -4)); 3149 short[16] correct = [-4,-8, -4, -8, 9,-32768, 0,-57, 1, 2, 0, 4, 2, 1, 2, -4]; 3150 assert(R.array == correct); 3151 } 3152 3153 /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values. 3154 __m256i _mm256_max_epu32 (__m256i a, __m256i b) pure @safe 3155 { 3156 // PERF D_SIMD 3157 version(GNU) 3158 enum bool split = true; 3159 else static if (SIMD_COMPARISON_MASKS_32B) 3160 enum bool split = false; 3161 else 3162 enum bool split = true; 3163 3164 static if (GDC_with_AVX2) 3165 { 3166 return cast(__m256i) __builtin_ia32_pmaxud256(cast(int8)a, cast(int8)b); 3167 } 3168 else static if (split) 3169 { 3170 // split 3171 __m128i a_lo = _mm256_extractf128_si256!0(a); 3172 __m128i a_hi = _mm256_extractf128_si256!1(a); 3173 __m128i b_lo = _mm256_extractf128_si256!0(b); 3174 __m128i b_hi = _mm256_extractf128_si256!1(b); 3175 __m128i r_lo = _mm_max_epu32(a_lo, b_lo); 3176 __m128i r_hi = _mm_max_epu32(a_hi, b_hi); 3177 return _mm256_set_m128i(r_hi, r_lo); 3178 } 3179 else static if (SIMD_COMPARISON_MASKS_32B) 3180 { 3181 // catastrophic with GDC x86 for some reason, like for 16-bit numbers. 3182 uint8 sa = cast(uint8)a; 3183 uint8 sb = cast(uint8)b; 3184 uint8 greater = sa > sb; 3185 return cast(__m256i)( (greater & sa) | (~greater & sb) ); 3186 } 3187 else 3188 static assert(0); 3189 } 3190 unittest 3191 { 3192 int8 R = cast(int8) _mm256_max_epu32(_mm256_setr_epi32(0x7fffffff, 1, 4, -7, 0x7fffffff, 1, 11, -7), 3193 _mm256_setr_epi32( -4,-8, 9, -8, -4,-8, 9, -8)); 3194 int[8] correct = [ -4,-8, 9, -7, -4,-8, 11, -7]; 3195 assert(R.array == correct); 3196 } 3197 3198 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values. 3199 __m256i _mm256_max_epu8 (__m256i a, __m256i b) pure @safe 3200 { 3201 // PERF D_SIMD 3202 version(GNU) 3203 enum bool split = true; 3204 else static if (SIMD_COMPARISON_MASKS_32B) 3205 enum bool split = false; 3206 else 3207 enum bool split = true; 3208 static if (GDC_with_AVX2) 3209 { 3210 return cast(__m256i) __builtin_ia32_pmaxub256(cast(ubyte32)a, cast(ubyte32)b); 3211 } 3212 else static if (split) 3213 { 3214 // split 3215 __m128i a_lo = _mm256_extractf128_si256!0(a); 3216 __m128i a_hi = _mm256_extractf128_si256!1(a); 3217 __m128i b_lo = _mm256_extractf128_si256!0(b); 3218 __m128i b_hi = _mm256_extractf128_si256!1(b); 3219 __m128i r_lo = _mm_max_epu8(a_lo, b_lo); 3220 __m128i r_hi = _mm_max_epu8(a_hi, b_hi); 3221 return _mm256_set_m128i(r_hi, r_lo); 3222 } 3223 else static if (SIMD_COMPARISON_MASKS_32B) 3224 { 3225 // This is real bad with GDC, again 3226 ubyte32 sa = cast(ubyte32)a; 3227 ubyte32 sb = cast(ubyte32)b; 3228 ubyte32 greater = cast(ubyte32)(sa > sb); 3229 return cast(__m256i)( (greater & sa) | (~greater & sb) ); 3230 } 3231 else 3232 static assert(false); 3233 } 3234 unittest 3235 { 3236 byte32 R = cast(byte32) _mm256_max_epu8(_mm256_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 3237 _mm256_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 3238 byte[32] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 3239 assert(R.array == correct); 3240 } 3241 3242 // Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values. 3243 __m256i _mm256_min_epi16 (__m256i a, __m256i b) pure @safe 3244 { 3245 // PERF D_SIMD 3246 version(GNU) 3247 enum bool split = true; 3248 else static if (SIMD_COMPARISON_MASKS_32B) 3249 enum bool split = false; 3250 else 3251 enum bool split = true; 3252 3253 static if (GDC_with_AVX2) 3254 { 3255 return cast(__m256i) __builtin_ia32_pminsw256(cast(short16)a, cast(short16)b); 3256 } 3257 else static if (split) 3258 { 3259 // split 3260 __m128i a_lo = _mm256_extractf128_si256!0(a); 3261 __m128i a_hi = _mm256_extractf128_si256!1(a); 3262 __m128i b_lo = _mm256_extractf128_si256!0(b); 3263 __m128i b_hi = _mm256_extractf128_si256!1(b); 3264 __m128i r_lo = _mm_min_epi16(a_lo, b_lo); 3265 __m128i r_hi = _mm_min_epi16(a_hi, b_hi); 3266 return _mm256_set_m128i(r_hi, r_lo); 3267 } 3268 else static if (SIMD_COMPARISON_MASKS_32B) 3269 { 3270 // same as _mm256_min_epi16, this is catastrophic with GDC -mavx 3271 short16 sa = cast(short16)a; 3272 short16 sb = cast(short16)b; 3273 short16 greater = sa > sb; 3274 return cast(__m256i)( (~greater & sa) | (greater & sb) ); 3275 } 3276 else 3277 static assert(0); 3278 } 3279 unittest 3280 { 3281 short16 R = cast(short16) _mm256_min_epi16(_mm256_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57, 1, 0, 0, 0, 1, 0, 0, 0), 3282 _mm256_setr_epi16( -4,-8, 9, 7, 0,-32768, 0, 0, 0, 2, 0, 4, 2, 1, 2, -4)); 3283 short[16] correct = [ -4,-8, -4, -8, 0,-32768, 0,-57, 0, 0, 0, 0, 1, 0, 0, -4]; 3284 assert(R.array == correct); 3285 } 3286 3287 /// Compare packed signed 32-bit integers in `a` and `b`, and return packed minimum values. 3288 __m256i _mm256_min_epi32 (__m256i a, __m256i b) pure @safe 3289 { 3290 // PERF D_SIMD 3291 version(GNU) 3292 enum bool split = true; 3293 else static if (SIMD_COMPARISON_MASKS_32B) 3294 enum bool split = false; 3295 else 3296 enum bool split = true; 3297 3298 static if (GDC_with_AVX2) 3299 { 3300 return cast(__m256i) __builtin_ia32_pminsd256(cast(int8)a, cast(int8)b); 3301 } 3302 else static if (split) 3303 { 3304 // split 3305 __m128i a_lo = _mm256_extractf128_si256!0(a); 3306 __m128i a_hi = _mm256_extractf128_si256!1(a); 3307 __m128i b_lo = _mm256_extractf128_si256!0(b); 3308 __m128i b_hi = _mm256_extractf128_si256!1(b); 3309 __m128i r_lo = _mm_min_epi32(a_lo, b_lo); 3310 __m128i r_hi = _mm_min_epi32(a_hi, b_hi); 3311 return _mm256_set_m128i(r_hi, r_lo); 3312 } 3313 else static if (SIMD_COMPARISON_MASKS_32B) 3314 { 3315 // Not checked this one, probably same badness issue with GDC 3316 int8 sa = cast(int8)a; 3317 int8 sb = cast(int8)b; 3318 int8 greater = sa > sb; 3319 return cast(__m256i)( (~greater & sa) | (greater & sb) ); 3320 } 3321 else 3322 static assert(0); 3323 } 3324 unittest 3325 { 3326 int8 R = cast(int8) _mm256_min_epi32(_mm256_setr_epi32(0x7fffffff, 1, -4, 7, 0x7fffffff, 2, -4, 7), 3327 _mm256_setr_epi32( -4,-8, 9, -8,-0x80000000,-8, 9, -8)); 3328 int[8] correct = [ - 4,-8, -4, -8,-0x80000000,-8, -4, -8]; 3329 assert(R.array == correct); 3330 } 3331 3332 3333 /// Compare packed signed 8-bit integers in `a` and `b`, and return packed minimum values. 3334 __m256i _mm256_min_epi8 (__m256i a, __m256i b) pure @trusted 3335 { 3336 // PERF D_SIMD 3337 version(GNU) 3338 enum bool split = true; 3339 else static if (SIMD_COMPARISON_MASKS_32B) 3340 enum bool split = false; 3341 else 3342 enum bool split = true; 3343 static if (GDC_with_AVX2) 3344 { 3345 // Strangely, GDC asks for unsigned ubyte32 3346 return cast(__m256i) __builtin_ia32_pminsb256(cast(ubyte32)a, cast(ubyte32)b); 3347 } 3348 else static if (split) 3349 { 3350 // split 3351 __m128i a_lo = _mm256_extractf128_si256!0(a); 3352 __m128i a_hi = _mm256_extractf128_si256!1(a); 3353 __m128i b_lo = _mm256_extractf128_si256!0(b); 3354 __m128i b_hi = _mm256_extractf128_si256!1(b); 3355 __m128i r_lo = _mm_min_epi8(a_lo, b_lo); 3356 __m128i r_hi = _mm_min_epi8(a_hi, b_hi); 3357 return _mm256_set_m128i(r_hi, r_lo); 3358 } 3359 else static if (SIMD_COMPARISON_MASKS_32B) 3360 { 3361 // This is real bad with GDC, again 3362 byte32 sa = cast(byte32)a; 3363 byte32 sb = cast(byte32)b; 3364 byte32 greater = cast(byte32)(sa > sb); 3365 return cast(__m256i)( (~greater & sa) | (greater & sb) ); 3366 } 3367 else 3368 static assert(false); 3369 } 3370 unittest 3371 { 3372 __m256i A = _mm256_setr_epi8(127, 1, -4, -8, 9, 7, 0, -57, 0, 0, 0, 0, 0, 0, 0, 0, 127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); 3373 __m256i B = _mm256_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, -4, 0, 0); 3374 byte32 R = cast(byte32) _mm256_min_epi8(A, B); 3375 byte[32] correct = [ 4, -8, -4, -8, 0, -128, 0, -57, 0, 0, 0, 0, 0, 0, 0, 0, 4, -8, -4, -8, 0, -128, 0, 0, 0, 0, 0, 0, 0, -4, 0, 0]; 3376 assert(R.array == correct); 3377 } 3378 3379 /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed minimum values. 3380 __m256i _mm256_min_epu16 (__m256i a, __m256i b) pure @trusted 3381 { 3382 // PERF D_SIMD 3383 version(GNU) 3384 enum bool split = true; 3385 else static if (SIMD_COMPARISON_MASKS_32B) 3386 enum bool split = false; 3387 else 3388 enum bool split = true; 3389 3390 static if (GDC_with_AVX2) 3391 { 3392 return cast(__m256i) __builtin_ia32_pminuw256(cast(short16)a, cast(short16)b); 3393 } 3394 else static if (split) 3395 { 3396 // split 3397 __m128i a_lo = _mm256_extractf128_si256!0(a); 3398 __m128i a_hi = _mm256_extractf128_si256!1(a); 3399 __m128i b_lo = _mm256_extractf128_si256!0(b); 3400 __m128i b_hi = _mm256_extractf128_si256!1(b); 3401 __m128i r_lo = _mm_min_epu16(a_lo, b_lo); 3402 __m128i r_hi = _mm_min_epu16(a_hi, b_hi); 3403 return _mm256_set_m128i(r_hi, r_lo); 3404 } 3405 else static if (SIMD_COMPARISON_MASKS_32B) 3406 { 3407 // catastrophic with GDC x86_64 3408 short16 sa = cast(short16)a; 3409 short16 sb = cast(short16)b; 3410 short16 greater = cast(short16)(cast(ushort16)sa > cast(ushort16)sb); 3411 return cast(__m256i)( (~greater & sa) | (greater & sb) ); 3412 } 3413 else 3414 static assert(false); 3415 } 3416 unittest 3417 { 3418 short16 R = cast(short16) _mm256_min_epu16(_mm256_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57, 1, 0, 0, 0, 1, 0, 0, -6), 3419 _mm256_setr_epi16( -4, -8, 9, 7, 0,-32768, 0, 0, 0, 2, 0, 4, 2, 1, 2, -4)); 3420 short[16] correct = [32767, 1, 9, 7, 0, 7, 0, 0, 0, 0, 0, 0, 1, 0, 0, -6]; 3421 assert(R.array == correct); 3422 } 3423 3424 /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed minimum values. 3425 __m256i _mm256_min_epu32 (__m256i a, __m256i b) pure @safe 3426 { 3427 // PERF D_SIMD 3428 version(GNU) 3429 enum bool split = true; 3430 else static if (SIMD_COMPARISON_MASKS_32B) 3431 enum bool split = false; 3432 else 3433 enum bool split = true; 3434 3435 static if (GDC_with_AVX2) 3436 { 3437 return cast(__m256i) __builtin_ia32_pminud256(cast(int8)a, cast(int8)b); 3438 } 3439 else static if (split) 3440 { 3441 // split 3442 __m128i a_lo = _mm256_extractf128_si256!0(a); 3443 __m128i a_hi = _mm256_extractf128_si256!1(a); 3444 __m128i b_lo = _mm256_extractf128_si256!0(b); 3445 __m128i b_hi = _mm256_extractf128_si256!1(b); 3446 __m128i r_lo = _mm_min_epu32(a_lo, b_lo); 3447 __m128i r_hi = _mm_min_epu32(a_hi, b_hi); 3448 return _mm256_set_m128i(r_hi, r_lo); 3449 } 3450 else static if (SIMD_COMPARISON_MASKS_32B) 3451 { 3452 // catastrophic with GDC, so in this case split instead 3453 uint8 sa = cast(uint8)a; 3454 uint8 sb = cast(uint8)b; 3455 uint8 greater = sa > sb; 3456 return cast(__m256i)( (greater & sb) | (~greater & sa) ); 3457 } 3458 else 3459 static assert(0); 3460 } 3461 unittest 3462 { 3463 int8 R = cast(int8) _mm256_min_epu32(_mm256_setr_epi32(0x7fffffff, 1, 4, -7, 0x7fffffff, 1, 11, -7), 3464 _mm256_setr_epi32( -4,-8, 9, -8, -4,-8, 9, -8)); 3465 int[8] correct = [0x7fffffff, 1, 4, -8, 0x7fffffff, 1, 9, -8]; 3466 assert(R.array == correct); 3467 } 3468 3469 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values. 3470 __m256i _mm256_min_epu8 (__m256i a, __m256i b) pure @safe 3471 { 3472 // PERF D_SIMD 3473 version(GNU) 3474 enum bool split = true; 3475 else static if (SIMD_COMPARISON_MASKS_32B) 3476 enum bool split = false; 3477 else 3478 enum bool split = true; 3479 static if (GDC_with_AVX2) 3480 { 3481 return cast(__m256i) __builtin_ia32_pminub256(cast(ubyte32)a, cast(ubyte32)b); 3482 } 3483 else static if (split) 3484 { 3485 // split 3486 __m128i a_lo = _mm256_extractf128_si256!0(a); 3487 __m128i a_hi = _mm256_extractf128_si256!1(a); 3488 __m128i b_lo = _mm256_extractf128_si256!0(b); 3489 __m128i b_hi = _mm256_extractf128_si256!1(b); 3490 __m128i r_lo = _mm_min_epu8(a_lo, b_lo); 3491 __m128i r_hi = _mm_min_epu8(a_hi, b_hi); 3492 return _mm256_set_m128i(r_hi, r_lo); 3493 } 3494 else static if (SIMD_COMPARISON_MASKS_32B) 3495 { 3496 ubyte32 sa = cast(ubyte32)a; 3497 ubyte32 sb = cast(ubyte32)b; 3498 ubyte32 greater = cast(ubyte32)(sa > sb); 3499 return cast(__m256i)( (~greater & sa) | (greater & sb) ); 3500 } 3501 else 3502 static assert(false); 3503 } 3504 unittest 3505 { 3506 byte32 R = cast(byte32) _mm256_min_epu8(_mm256_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 3507 _mm256_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 3508 byte[32] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; 3509 assert(R.array == correct); 3510 } 3511 3512 /// Create mask from the most significant bit of each 8-bit element in `a`. 3513 int _mm256_movemask_epi8 (__m256i a) pure @trusted 3514 { 3515 static if (GDC_with_AVX2) 3516 { 3517 return __builtin_ia32_pmovmskb256(cast(ubyte32)a); 3518 } 3519 else static if (LDC_with_AVX2) 3520 { 3521 return __builtin_ia32_pmovmskb256(cast(byte32)a); 3522 } 3523 else 3524 { 3525 // ARM64 splitting makes it 33 inst instead of 48 for naive version. 3526 // PERF not sure if there is something better, sounds likely 3527 // Otherwise, beneficial for every case. 3528 __m128i a_lo = _mm256_extractf128_si256!0(a); 3529 __m128i a_hi = _mm256_extractf128_si256!1(a); 3530 return (_mm_movemask_epi8(a_hi) << 16) | _mm_movemask_epi8(a_lo); 3531 } 3532 } 3533 unittest 3534 { 3535 assert(0x9D37_9C36 == _mm256_movemask_epi8(_mm256_set_epi8(-1, 1, 2, -3, -1, -1, 4,-8, 127, 0, -1, -1, 0, -1, -1, -1, 3536 -1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0))); 3537 } 3538 3539 /// Basically 2x `_mm_mpsadbw_epu8` in parallel, over the two lanes. 3540 __m256i _mm256_mpsadbw_epu8(int imm8)(__m256i a, __m256i b) pure @safe 3541 { 3542 static if (GDC_with_AVX2) 3543 { 3544 return cast(__m256i) __builtin_ia32_mpsadbw256(cast(ubyte32)a, 3545 cast(ubyte32)b, 3546 imm8); 3547 } 3548 else static if (LDC_with_AVX2) 3549 { 3550 return cast(__m256i) __builtin_ia32_mpsadbw256(cast(byte32)a, 3551 cast(byte32)b, 3552 imm8); 3553 } 3554 else 3555 { 3556 // split 3557 __m128i a_lo = _mm256_extractf128_si256!0(a); 3558 __m128i a_hi = _mm256_extractf128_si256!1(a); 3559 __m128i b_lo = _mm256_extractf128_si256!0(b); 3560 __m128i b_hi = _mm256_extractf128_si256!1(b); 3561 __m128i r_lo = _mm_mpsadbw_epu8!(imm8 & 7)(a_lo, b_lo); 3562 __m128i r_hi = _mm_mpsadbw_epu8!((imm8 >> 3) & 7)(a_hi, b_hi); 3563 return _mm256_set_m128i(r_hi, r_lo); 3564 } 3565 } 3566 unittest 3567 { 3568 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 3569 __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5, 5, 5, 12, 13, 14, 15); 3570 __m256i AA = _mm256_set_m128i(A, A); 3571 __m256i BB = _mm256_set_m128i(B, B); 3572 short[16] correct = [755, 753, 751, 749, 747, 745, 743, 741, 3573 32, 28, 24, 20, 16, 12, 8, 4]; 3574 short16 r5 = cast(short16) _mm256_mpsadbw_epu8!(7 * 8 + 5)(AA, BB); 3575 assert(r5.array == correct); 3576 } 3577 3578 /// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`, and 3579 /// return the signed 64-bit results. 3580 __m256i _mm256_mul_epi32 (__m256i a, __m256i b) pure @trusted 3581 { 3582 // PERF LDC + SSE2 to SSSE3. I don't quite see what to do, same problem in _mm_mul_epi32. 3583 static if (GDC_with_AVX2) 3584 { 3585 return cast(__m256i) __builtin_ia32_pmuldq256(cast(int8)a, cast(int8)b); 3586 } 3587 else static if ( (LDC_with_SSE41 || LDC_with_AVX2) && LDC_with_optimizations) 3588 { 3589 // good with LDC + SSE4.1 to AVX2, else need to split 3590 enum ir = ` 3591 %ia = shufflevector <8 x i32> %0,<8 x i32> %0, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 3592 %ib = shufflevector <8 x i32> %1,<8 x i32> %1, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 3593 %la = sext <4 x i32> %ia to <4 x i64> 3594 %lb = sext <4 x i32> %ib to <4 x i64> 3595 %r = mul <4 x i64> %la, %lb 3596 ret <4 x i64> %r`; 3597 return cast(__m256i) LDCInlineIR!(ir, long4, int8, int8)(cast(int8)a, cast(int8)b); 3598 } 3599 else 3600 { 3601 // split, very beneficial with LDC+ARM64 3602 __m128i a_lo = _mm256_extractf128_si256!0(a); 3603 __m128i a_hi = _mm256_extractf128_si256!1(a); 3604 __m128i b_lo = _mm256_extractf128_si256!0(b); 3605 __m128i b_hi = _mm256_extractf128_si256!1(b); 3606 __m128i r_lo = _mm_mul_epi32(a_lo, b_lo); 3607 __m128i r_hi = _mm_mul_epi32(a_hi, b_hi); 3608 return _mm256_set_m128i(r_hi, r_lo); 3609 } 3610 } 3611 unittest 3612 { 3613 __m256i A = _mm256_setr_epi32(61616461, 1915324654, 4564061, 3, 61616466, 1915324654, 4564061, 3); 3614 __m256i B = _mm256_setr_epi32(49716422, -915616216, -121144, 0, 49716422, -915616216, -121145, 0); 3615 long4 R = cast(long4) _mm256_mul_epi32(A, B); 3616 long[4] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144, cast(long)61616466 * 49716422, cast(long)4564061 * -121145]; 3617 assert(R.array == correct); 3618 } 3619 3620 /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, and 3621 /// return the unsigned 64-bit results. 3622 __m256i _mm256_mul_epu32 (__m256i a, __m256i b) pure @trusted 3623 { 3624 // PERF DMD 3625 static if (GDC_with_AVX2) 3626 { 3627 return cast(__m256i) __builtin_ia32_pmuludq256(cast(int8)a, cast(int8)b); 3628 } 3629 else version(GNU) 3630 { 3631 // explicit split needed for GDC without avx2 3632 __m128i a_lo = _mm256_extractf128_si256!0(a); 3633 __m128i a_hi = _mm256_extractf128_si256!1(a); 3634 __m128i b_lo = _mm256_extractf128_si256!0(b); 3635 __m128i b_hi = _mm256_extractf128_si256!1(b); 3636 __m128i r_lo = _mm_mul_epu32(a_lo, b_lo); 3637 __m128i r_hi = _mm_mul_epu32(a_hi, b_hi); 3638 return _mm256_set_m128i(r_hi, r_lo); 3639 } 3640 else 3641 { 3642 // Works well in all LDC cases, surprisingly. 3643 int8 ia = cast(int8)a; 3644 int8 ib = cast(int8)b; 3645 long4 r; 3646 r.ptr[0] = cast(long)cast(uint)ia.array[0] * cast(long)cast(uint)ib.array[0]; 3647 r.ptr[1] = cast(long)cast(uint)ia.array[2] * cast(long)cast(uint)ib.array[2]; 3648 r.ptr[2] = cast(long)cast(uint)ia.array[4] * cast(long)cast(uint)ib.array[4]; 3649 r.ptr[3] = cast(long)cast(uint)ia.array[6] * cast(long)cast(uint)ib.array[6]; 3650 return cast(__m256i)r; 3651 } 3652 } 3653 unittest 3654 { 3655 __m256i A = _mm256_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff, 42, 0xDEADBEEF, 42, 0xffffffff); 3656 __m256i B = _mm256_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff, 42, 0xCAFEBABE, 42, 0xffffffff); 3657 __m256i C = _mm256_mul_epu32(A, B); 3658 long4 LC = cast(long4)C; 3659 long[4] correct = [18446744065119617025uL, 12723420444339690338uL, 18446744065119617025uL, 12723420444339690338uL]; 3660 assert(LC.array == correct); 3661 } 3662 3663 /// Multiply the packed signed 16-bit integers in `a` and `b`, 3664 /// producing intermediate 32-bit integers, and return the high 3665 /// 16 bits of the intermediate integers. 3666 __m256i _mm256_mulhi_epi16 (__m256i a, __m256i b) pure @safe 3667 { 3668 static if (GDC_with_AVX2) 3669 { 3670 return cast(__m256i) __builtin_ia32_pmulhw256(cast(short16)a, cast(short16)b); 3671 } 3672 else static if (LDC_with_AVX2) 3673 { 3674 return cast(__m256i) __builtin_ia32_pmulhw256(cast(short16)a, cast(short16)b); 3675 } 3676 else 3677 { 3678 // split 3679 __m128i a_lo = _mm256_extractf128_si256!0(a); 3680 __m128i a_hi = _mm256_extractf128_si256!1(a); 3681 __m128i b_lo = _mm256_extractf128_si256!0(b); 3682 __m128i b_hi = _mm256_extractf128_si256!1(b); 3683 __m128i r_lo = _mm_mulhi_epi16(a_lo, b_lo); 3684 __m128i r_hi = _mm_mulhi_epi16(a_hi, b_hi); 3685 return _mm256_set_m128i(r_hi, r_lo); 3686 } 3687 } 3688 unittest 3689 { 3690 __m256i A = _mm256_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7, 0, -16, 2, 3, 4, 8, 16, 8); 3691 __m256i B = _mm256_set1_epi16(16384); 3692 short16 R = cast(short16)_mm256_mulhi_epi16(A, B); 3693 short[16] correct = [0, -4, 0, 0, 1, 2, 4, 1, 0, -4, 0, 0, 1, 2, 4, 2]; 3694 assert(R.array == correct); 3695 } 3696 3697 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, 3698 /// producing intermediate 32-bit integers, and return the high 3699 /// 16 bits of the intermediate integers. 3700 __m256i _mm256_mulhi_epu16 (__m256i a, __m256i b) pure @safe 3701 { 3702 static if (GDC_with_AVX2) 3703 { 3704 return cast(__m256i) __builtin_ia32_pmulhuw256(cast(short16)a, cast(short16)b); 3705 } 3706 else static if (LDC_with_AVX2) 3707 { 3708 return cast(__m256i) __builtin_ia32_pmulhuw256(cast(short16)a, cast(short16)b); 3709 } 3710 else 3711 { 3712 // split 3713 __m128i a_lo = _mm256_extractf128_si256!0(a); 3714 __m128i a_hi = _mm256_extractf128_si256!1(a); 3715 __m128i b_lo = _mm256_extractf128_si256!0(b); 3716 __m128i b_hi = _mm256_extractf128_si256!1(b); 3717 __m128i r_lo = _mm_mulhi_epu16(a_lo, b_lo); 3718 __m128i r_hi = _mm_mulhi_epu16(a_hi, b_hi); 3719 return _mm256_set_m128i(r_hi, r_lo); 3720 } 3721 } 3722 3723 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers. 3724 /// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return 3725 /// bits [16:1] to dst. 3726 __m256i _mm256_mulhrs_epi16 (__m256i a, __m256i b) pure @safe 3727 { 3728 static if (GDC_or_LDC_with_AVX2) 3729 { 3730 return cast(__m256i)__builtin_ia32_pmulhrsw256(cast(short16)a, cast(short16)b); 3731 } 3732 else 3733 { 3734 // ARM64: 8 instr with LDC >= 1.32 -O2, nice 3735 __m128i a_lo = _mm256_extractf128_si256!0(a); 3736 __m128i a_hi = _mm256_extractf128_si256!1(a); 3737 __m128i b_lo = _mm256_extractf128_si256!0(b); 3738 __m128i b_hi = _mm256_extractf128_si256!1(b); 3739 __m128i r_lo = _mm_mulhrs_epi16(a_lo, b_lo); 3740 __m128i r_hi = _mm_mulhrs_epi16(a_hi, b_hi); 3741 return _mm256_set_m128i(r_hi, r_lo); 3742 } 3743 } 3744 unittest 3745 { 3746 __m128i A = _mm_setr_epi16(12345, -32768, 32767, 0, 1, 845, -6999, -1); 3747 __m128i B = _mm_setr_epi16(8877, -24487, 15678, 32760, 1, 0, -149, -1); 3748 __m256i AB = _mm256_set_m128i(B, A); 3749 __m256i BA = _mm256_set_m128i(A, B); 3750 short16 C = cast(short16) _mm256_mulhrs_epi16(AB, BA); 3751 short[16] correct = [3344, 24487, 15678, 0, 0, 0, 32, 0, 3344, 24487, 15678, 0, 0, 0, 32, 0]; 3752 assert(C.array == correct); 3753 } 3754 3755 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, 3756 /// and return the low 16 bits of the intermediate integers. 3757 __m256i _mm256_mullo_epi16 (__m256i a, __m256i b) pure @safe 3758 { 3759 // PERF D_SIMD 3760 static if (GDC_with_AVX) 3761 { 3762 return cast(__m256i)(cast(short16)a * cast(short16)b); 3763 } 3764 else version(LDC) 3765 { 3766 return cast(__m256i)(cast(short16)a * cast(short16)b); 3767 } 3768 else 3769 { 3770 // split 3771 __m128i a_lo = _mm256_extractf128_si256!0(a); 3772 __m128i a_hi = _mm256_extractf128_si256!1(a); 3773 __m128i b_lo = _mm256_extractf128_si256!0(b); 3774 __m128i b_hi = _mm256_extractf128_si256!1(b); 3775 __m128i r_lo = _mm_mullo_epi16(a_lo, b_lo); 3776 __m128i r_hi = _mm_mullo_epi16(a_hi, b_hi); 3777 return _mm256_set_m128i(r_hi, r_lo); 3778 } 3779 } 3780 unittest 3781 { 3782 __m256i A = _mm256_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7, 16384, -16, 0, 3, 4, 1, 16, 7); 3783 __m256i B = _mm256_set1_epi16(16384); 3784 short16 R = cast(short16)_mm256_mullo_epi16(A, B); 3785 short[16] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384, 0, 0, 0, -16384, 0, 16384, 0, -16384]; 3786 assert(R.array == correct); 3787 } 3788 3789 /// Multiply the packed signed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, 3790 /// and store the low 32 bits of the intermediate integer. 3791 __m256i _mm256_mullo_epi32 (__m256i a, __m256i b) pure @safe 3792 { 3793 // PERF D_SIMD 3794 static if (GDC_with_AVX) 3795 { 3796 return cast(__m256i)(cast(int8)a * cast(int8)b); 3797 } 3798 else version(LDC) 3799 { 3800 return cast(__m256i)(cast(int8)a * cast(int8)b); 3801 } 3802 else 3803 { 3804 // split 3805 __m128i a_lo = _mm256_extractf128_si256!0(a); 3806 __m128i a_hi = _mm256_extractf128_si256!1(a); 3807 __m128i b_lo = _mm256_extractf128_si256!0(b); 3808 __m128i b_hi = _mm256_extractf128_si256!1(b); 3809 __m128i r_lo = _mm_mullo_epi32(a_lo, b_lo); 3810 __m128i r_hi = _mm_mullo_epi32(a_hi, b_hi); 3811 return _mm256_set_m128i(r_hi, r_lo); 3812 } 3813 } 3814 unittest 3815 { 3816 __m256i A = _mm256_setr_epi32(61616461, 1915324654, 4564061, 3, 61616461, 1915324654, 4564061, 3); 3817 __m256i B = _mm256_setr_epi32(49716422, -915616216, -121144, 0, 49716422, -915616216, -121144, 1); 3818 int8 R = cast(int8) _mm256_mullo_epi32(A, B); 3819 int[8] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0, 3820 cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 3]; 3821 assert(R.array == correct); 3822 } 3823 3824 /// Compute the bitwise OR of 256 bits (representing integer data) in `a` and `b`. 3825 __m256i _mm256_or_si256 (__m256i a, __m256i b) pure @safe 3826 { 3827 return a | b; 3828 } 3829 unittest 3830 { 3831 long A = 0x55555555_55555555; 3832 long B = 0xAAAAAAAA_AAAAAAAA; 3833 __m256i vA = _mm256_set_epi64(A, B, A, B); 3834 __m256i vB = _mm256_set_epi64(B, A, 0, B); 3835 __m256i R = _mm256_or_si256(vA, vB); 3836 long[4] correct = [B, A, -1, -1]; 3837 assert(R.array == correct); 3838 } 3839 3840 /// Convert packed signed 16-bit integers from `a` and `b `to packed 8-bit integers using signed saturation. 3841 /// Warning: `a` and `b` are interleaved per-lane. 3842 /// Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1. 3843 __m256i _mm256_packs_epi16 (__m256i a, __m256i b) pure @safe 3844 { 3845 // PERF D_SIMD 3846 static if (GDC_with_AVX2) 3847 { 3848 return cast(__m256i) __builtin_ia32_packsswb256(cast(short16)a, cast(short16)b); 3849 } 3850 else static if (LDC_with_AVX2) 3851 { 3852 return cast(__m256i) __builtin_ia32_packsswb256(cast(short16)a, cast(short16)b); 3853 } 3854 else 3855 { 3856 __m128i a_lo = _mm256_extractf128_si256!0(a); 3857 __m128i a_hi = _mm256_extractf128_si256!1(a); 3858 __m128i b_lo = _mm256_extractf128_si256!0(b); 3859 __m128i b_hi = _mm256_extractf128_si256!1(b); 3860 __m128i r_lo = _mm_packs_epi16(a_lo, b_lo); 3861 __m128i r_hi = _mm_packs_epi16(a_hi, b_hi); 3862 return _mm256_set_m128i(r_hi, r_lo); 3863 } 3864 } 3865 unittest 3866 { 3867 __m256i A = _mm256_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0, 3868 -1000, -1000, 1000, 0, 256, -129, 254, 0); 3869 byte32 R = cast(byte32) _mm256_packs_epi16(A, A); 3870 byte[32] correct = [127, -128, 127, 0, 127, -128, 127, 0, 3871 127, -128, 127, 0, 127, -128, 127, 0, 3872 -128, -128, 127, 0, 127, -128, 127, 0, 3873 -128, -128, 127, 0, 127, -128, 127, 0]; 3874 assert(R.array == correct); 3875 } 3876 3877 /// Convert packed signed 32-bit integers from `a` and `b `to packed 16-bit integers using signed saturation. 3878 /// Warning: `a` and `b` are interleaved per-lane. 3879 /// Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1. 3880 __m256i _mm256_packs_epi32 (__m256i a, __m256i b) pure @safe 3881 { 3882 // PERF D_SIMD 3883 static if (GDC_with_AVX2) 3884 { 3885 return cast(__m256i) __builtin_ia32_packssdw256(cast(int8)a, cast(int8)b); 3886 } 3887 else static if (LDC_with_AVX2) 3888 { 3889 return cast(__m256i) __builtin_ia32_packssdw256(cast(int8)a, cast(int8)b); 3890 } 3891 else 3892 { 3893 __m128i a_lo = _mm256_extractf128_si256!0(a); 3894 __m128i a_hi = _mm256_extractf128_si256!1(a); 3895 __m128i b_lo = _mm256_extractf128_si256!0(b); 3896 __m128i b_hi = _mm256_extractf128_si256!1(b); 3897 __m128i r_lo = _mm_packs_epi32(a_lo, b_lo); 3898 __m128i r_hi = _mm_packs_epi32(a_hi, b_hi); 3899 return _mm256_set_m128i(r_hi, r_lo); 3900 } 3901 } 3902 unittest 3903 { 3904 __m256i A = _mm256_setr_epi32(100000, -100000, 1000, 0, 4, 5, -100000, 7); 3905 short16 R = cast(short16) _mm256_packs_epi32(A, A); 3906 short[16] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0, 4, 5, -32768, 7, 4, 5, -32768, 7]; 3907 assert(R.array == correct); 3908 } 3909 3910 3911 /// Convert packed signed 16-bit integers from `a` and `b `to packed 8-bit integers using unsigned saturation. 3912 /// Warning: `a` and `b` are interleaved per-lane. 3913 /// Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1. 3914 __m256i _mm256_packus_epi16 (__m256i a, __m256i b) pure @trusted 3915 { 3916 // PERF D_SIMD 3917 static if (GDC_with_AVX2) 3918 { 3919 return cast(__m256i) __builtin_ia32_packuswb256(cast(short16)a, cast(short16)b); 3920 } 3921 else static if (LDC_with_AVX2) 3922 { 3923 return cast(__m256i) __builtin_ia32_packuswb256(cast(short16)a, cast(short16)b); 3924 } 3925 else 3926 { 3927 // Always beneficial with LDC. 3928 // arm64: 4 inst with LDC -O1 3929 __m128i a_lo = _mm256_extractf128_si256!0(a); 3930 __m128i a_hi = _mm256_extractf128_si256!1(a); 3931 __m128i b_lo = _mm256_extractf128_si256!0(b); 3932 __m128i b_hi = _mm256_extractf128_si256!1(b); 3933 __m128i r_lo = _mm_packus_epi16(a_lo, b_lo); 3934 __m128i r_hi = _mm_packus_epi16(a_hi, b_hi); 3935 return _mm256_set_m128i(r_hi, r_lo); 3936 } 3937 } 3938 unittest 3939 { 3940 __m256i A = _mm256_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0, -10, 400, 0, 256, -32768, 2, 1, 0); 3941 __m256i B = _mm256_setr_epi16( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 3942 byte32 R = cast(byte32) _mm256_packus_epi16(A, B); 3943 align(32) static immutable byte[32] correctResult = [0, -1, 0, -1, -1, 2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 3944 0, -1, 0, -1, 0 , 2, 1, 0, 8, 9, 10, 11, 12, 13, 14, 15]; 3945 assert(R.array == correctResult); 3946 } 3947 3948 /// Convert packed signed 32-bit integers from `a` and `b `to packed 16-bit integers using unsigned saturation. 3949 /// Warning: `a` and `b` are interleaved per-lane. 3950 /// Result has: `a` lane 0, `b` lane 0, `a` lane 1, `b` lane 1. 3951 __m256i _mm256_packus_epi32 (__m256i a, __m256i b) pure @safe 3952 { 3953 // PERF D_SIMD 3954 static if (GDC_with_AVX2) 3955 { 3956 return cast(__m256i) __builtin_ia32_packusdw256(cast(int8)a, cast(int8)b); 3957 } 3958 else static if (LDC_with_AVX2) 3959 { 3960 return cast(__m256i) __builtin_ia32_packusdw256(cast(int8)a, cast(int8)b); 3961 } 3962 else 3963 { 3964 // 8 inst in arm64 since LDC 1.22 -O2, 3965 // sounds a bit underperforming maybe 3966 __m128i a_lo = _mm256_extractf128_si256!0(a); 3967 __m128i a_hi = _mm256_extractf128_si256!1(a); 3968 __m128i b_lo = _mm256_extractf128_si256!0(b); 3969 __m128i b_hi = _mm256_extractf128_si256!1(b); 3970 __m128i r_lo = _mm_packus_epi32(a_lo, b_lo); 3971 __m128i r_hi = _mm_packus_epi32(a_hi, b_hi); 3972 return _mm256_set_m128i(r_hi, r_lo); 3973 } 3974 } 3975 unittest 3976 { 3977 __m256i A = _mm256_setr_epi32(100000, -100000, 1000, 0, 100000, -100000, 1000, 1); 3978 short16 R = cast(short16) _mm256_packus_epi32(A, A); 3979 short[16] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0, 3980 cast(short)65535, 0, 1000, 1, cast(short)65535, 0, 1000, 1]; 3981 assert(R.array == correct); 3982 } 3983 3984 /// Shuffle 128-bits (composed of 2 packed (128-bit) integer elements) 3985 /// selected by `imm8` from `a` and `b`. 3986 /// See the documentation as the `imm8` format is quite complex. 3987 __m256i _mm256_permute2x128_si256(int imm8)(__m256i a, __m256i b) pure @safe 3988 { 3989 // PERF: the only difference with _mm256_permute2f128_si256, which we 3990 // haven't reproduced here, is that _mm256_permute2x128_si256 is supposed 3991 // to be with an integer hint at instruction level, and requires AVX2. 3992 return _mm256_permute2f128_si256!imm8(a, b); 3993 } 3994 unittest 3995 { 3996 __m256d A = _mm256_setr_pd(8.0, 1, 2, 3); 3997 __m256d B = _mm256_setr_pd(4.0, 5, 6, 7); 3998 __m256d R2 = _mm256_permute2f128_pd!(3*16 + 8 + 1)(A, B); 3999 double[4] correct2 = [0.0, 0.0, 6.0, 7.0]; 4000 assert(R2.array == correct2); 4001 } 4002 4003 /// Shuffle 64-bit integers in `a` across lanes using the control in `imm8`. 4004 __m256i _mm256_permute4x64_epi64(int imm8)(__m256i a) pure @trusted 4005 { 4006 static if (GDC_with_AVX2) 4007 return cast(__m256i) __builtin_ia32_permdi256(a, imm8); 4008 else static if (LDC_with_optimizations) 4009 { 4010 return shufflevector!(long4, (imm8 >> 0) & 3, 4011 (imm8 >> 2) & 3, 4012 (imm8 >> 4) & 3, 4013 (imm8 >> 6) & 3)(a, a); 4014 } 4015 else 4016 { 4017 __m256i b = a; 4018 static foreach (i; 0..4) 4019 a[i] = b[(imm8 & (0b00000011 << (i * 2))) >> (i * 2)]; 4020 return a; 4021 } 4022 } 4023 unittest 4024 { 4025 __m256i A = _mm256_setr_epi64x(1, 2, 3, 4); 4026 static immutable long[4] correct = [ 4, 3, 2, 1 ]; 4027 assert(_mm256_permute4x64_epi64!(0b00011011)(A).array == correct); 4028 4029 A = _mm256_setr_epi64x(1, 2, 3, 4); 4030 static immutable long[4] correct2 = [ 1, 4, 1, 1 ]; 4031 assert(_mm256_permute4x64_epi64!(0b00001100)(A).array == correct2); 4032 } 4033 4034 /// Shuffle 64-bit double in `a` across lanes using the control in `imm8`. 4035 __m256d _mm256_permute4x64_pd(int imm8)(__m256d a) pure @trusted 4036 { 4037 // PERF: ignore instruction-level type hint 4038 return cast(__m256d) _mm256_permute4x64_epi64!imm8(cast(__m256i)a); 4039 } 4040 unittest 4041 { 4042 __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); 4043 static immutable double[4] correct = [ 4.0, 3.0, 2.0, 1.0 ]; 4044 assert(_mm256_permute4x64_pd!(0b00011011)(A).array == correct); 4045 } 4046 4047 /// Shuffle 32-bit integers in `a` across lanes using the corresponding index in `idx`. 4048 __m256i _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx) pure @trusted 4049 { 4050 // While it _should_ be possible to use 4x _mm_shuffle_epi8 for this permute, 4051 // it is quite hard to pull off and simd-everwhere doesn't attempt either. 4052 static if (GDC_or_LDC_with_AVX2) 4053 { 4054 return cast(__m256i) __builtin_ia32_permvarsi256(cast(int8)a, cast(int8)idx); 4055 } 4056 else 4057 { 4058 // PERF ARM64 and x86 without AVX, it's not very good 4059 int8 ai = cast(int8)a; 4060 int8 ii = cast(int8)idx; 4061 int8 ri; 4062 4063 for (int j = 0; j < 8; ++j) 4064 { 4065 ri.ptr[j] = ai.array[ ii[j] & 7 ]; 4066 } 4067 return cast(__m256i) ri; 4068 } 4069 } 4070 unittest 4071 { 4072 __m256i A = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15); 4073 __m256i B = _mm256_setr_epi32(8 + 1, 4, 7, 8 + 2, 24, 3, 3, 2); 4074 int8 R = cast(int8) _mm256_permutevar8x32_epi32(A, B); 4075 int[8] correct = [ 9, 12, 15, 10, 8, 11, 11, 10 ]; 4076 assert(R.array == correct); 4077 } 4078 4079 /// Shuffle single-precision (32-bit) floating-point in `a` across lanes using the 4080 /// corresponding index in `idx`. 4081 __m256 _mm256_permutevar8x32_ps (__m256 a, __m256i idx) pure @safe 4082 { 4083 return cast(__m256) _mm256_permutevar8x32_epi32(cast(__m256i)a, cast(__m256i)idx); 4084 } 4085 4086 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 4087 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 4088 /// low 16 bits of 64-bit elements in result. 4089 __m256i _mm256_sad_epu8 (__m256i a, __m256i b) pure @trusted 4090 { 4091 static if (GDC_with_AVX2) 4092 { 4093 return cast(__m256i) __builtin_ia32_psadbw256(cast(ubyte32)a, cast(ubyte32)b); 4094 } 4095 else static if (LDC_with_AVX2) 4096 { 4097 return cast(__m256i) __builtin_ia32_psadbw256(cast(byte32)a, cast(byte32)b); 4098 } 4099 else 4100 { 4101 // split is beneficial for ARM64, LDC and GDC without AVX2 4102 __m128i a_lo = _mm256_extractf128_si256!0(a); 4103 __m128i a_hi = _mm256_extractf128_si256!1(a); 4104 __m128i b_lo = _mm256_extractf128_si256!0(b); 4105 __m128i b_hi = _mm256_extractf128_si256!1(b); 4106 __m128i r_lo = _mm_sad_epu8(a_lo, b_lo); 4107 __m128i r_hi = _mm_sad_epu8(a_hi, b_hi); 4108 return _mm256_set_m128i(r_hi, r_lo); 4109 } 4110 } 4111 unittest 4112 { 4113 __m256i A = _mm256_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54, 4114 3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 4115 __m256i B = _mm256_set1_epi8(1); 4116 int8 R = cast(int8) _mm256_sad_epu8(A, B); 4117 int[8] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 4118 0, 4119 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 4120 0, 4121 2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 4122 0, 4123 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 4124 0]; 4125 assert(R.array == correct); 4126 } 4127 4128 /// Shuffle 32-bit integers in `a` within 128-bit lanes using the control in `imm8`, and return the results. 4129 __m256i _mm256_shuffle_epi32(int imm8)(__m256i a) pure @trusted 4130 { 4131 static if (GDC_with_AVX2) 4132 return cast(__m256i)__builtin_ia32_pshufd256(cast(int8)a, imm8); 4133 else static if (LDC_with_AVX2) 4134 { 4135 return cast(__m256i)shufflevectorLDC!(int8, 4136 (imm8 >> 0) & 3, 4137 (imm8 >> 2) & 3, 4138 (imm8 >> 4) & 3, 4139 (imm8 >> 6) & 3, 4140 ((imm8 >> 0) & 3) + 4, 4141 ((imm8 >> 2) & 3) + 4, 4142 ((imm8 >> 4) & 3) + 4, 4143 ((imm8 >> 6) & 3) + 4)(cast(int8)a, cast(int8)a); 4144 } 4145 else 4146 { 4147 auto hi = _mm_shuffle_epi32!imm8(_mm256_extractf128_si256!0(a)); 4148 auto lo = _mm_shuffle_epi32!imm8(_mm256_extractf128_si256!1(a)); 4149 return _mm256_setr_m128i(hi, lo); 4150 } 4151 } 4152 unittest 4153 { 4154 __m256i a = _mm256_set_epi32(32, 31, 30, 29, 28, 27, 26, 25); 4155 assert(_mm256_shuffle_epi32!255(a).array == [120259084316L, 120259084316, 137438953504, 137438953504]); 4156 } 4157 4158 /// Shuffle 8-bit integers in `a` within 128-bit lanes according to shuffle control mask in the 4159 /// corresponding 8-bit element of `b`. 4160 __m256i _mm256_shuffle_epi8(__m256i a, __m256i b) pure @trusted 4161 { 4162 static if (GDC_with_AVX2) 4163 return cast(__m256i)__builtin_ia32_pshufb256(cast(ubyte32)a, cast(ubyte32)b); 4164 else static if (LDC_with_AVX2) 4165 return cast(__m256i)__builtin_ia32_pshufb256(cast(byte32)a, cast(byte32)b); 4166 else 4167 { 4168 auto hi = _mm_shuffle_epi8(_mm256_extractf128_si256!0(a), _mm256_extractf128_si256!0(b)); 4169 auto lo = _mm_shuffle_epi8(_mm256_extractf128_si256!1(a), _mm256_extractf128_si256!1(b)); 4170 return _mm256_setr_m128i(hi, lo); 4171 } 4172 } 4173 unittest 4174 { 4175 __m256i a = _mm256_set_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); 4176 __m256i b = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); 4177 4178 __m256i expected = _mm256_setr_epi8( 4179 2, 2, 2, 2, 2, 2, 2, 2, 4180 1, 1, 1, 1, 1, 1, 1, 1, 4181 18, 18, 18, 18, 18, 18, 18, 18, 4182 17, 17, 17, 17, 17, 17, 17, 17 4183 ); 4184 4185 assert(_mm256_shuffle_epi8(a, b).array == expected.array); 4186 } 4187 4188 /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using 4189 /// the control in `imm8`. Store the results in the high 64 bits of 128-bit lanes 4190 /// of result, with the low 64 bits of 128-bit lanes being copied from from `a`. 4191 /// See also: `_MM_SHUFFLE`. 4192 __m256i _mm256_shufflehi_epi16(int imm8)(__m256i a) pure @safe 4193 { 4194 static if (GDC_with_AVX2) 4195 { 4196 return cast(__m256i) __builtin_ia32_pshufhw256(cast(short16)a, imm8); 4197 } 4198 else static if (LDC_with_optimizations) 4199 { 4200 return cast(__m256i) shufflevectorLDC!(short16, 4201 0, 1, 2, 3, 4202 4 + ( (imm8 >> 0) & 3 ), 4203 4 + ( (imm8 >> 2) & 3 ), 4204 4 + ( (imm8 >> 4) & 3 ), 4205 4 + ( (imm8 >> 6) & 3 ), 4206 8, 9, 10, 11, 4207 12 + ( (imm8 >> 0) & 3 ), 4208 12 + ( (imm8 >> 2) & 3 ), 4209 12 + ( (imm8 >> 4) & 3 ), 4210 12 + ( (imm8 >> 6) & 3 )) 4211 (cast(short16)a, cast(short16)a); 4212 } 4213 else 4214 { 4215 __m128i a_lo = _mm256_extractf128_si256!0(a); 4216 __m128i a_hi = _mm256_extractf128_si256!1(a); 4217 __m128i r_lo = _mm_shufflehi_epi16!imm8(a_lo); 4218 __m128i r_hi = _mm_shufflehi_epi16!imm8(a_hi); 4219 return _mm256_set_m128i(r_hi, r_lo); 4220 } 4221 } 4222 unittest 4223 { 4224 __m256i A = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 4225 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 4226 short16 B = cast(short16) _mm256_shufflehi_epi16!SHUFFLE(A); 4227 short[16] expectedB = [ 0, 1, 2, 3, 7, 6, 5, 4, 8, 9, 10, 11, 15, 14, 13, 12 ]; 4228 assert(B.array == expectedB); 4229 } 4230 4231 /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using 4232 /// the control in `imm8`. Store the results in the low 64 bits of 128-bit lanes 4233 /// of result, with the high 64 bits of 128-bit lanes being copied from from `a`. 4234 /// See also: `_MM_SHUFFLE`. 4235 __m256i _mm256_shufflelo_epi16(int imm8)(__m256i a) pure @safe 4236 { 4237 static if (GDC_with_AVX2) 4238 { 4239 return cast(__m256i) __builtin_ia32_pshuflw256(cast(short16)a, imm8); 4240 } 4241 else static if (LDC_with_optimizations) 4242 { 4243 return cast(__m256i) shufflevectorLDC!(short16, 4244 ( (imm8 >> 0) & 3 ), 4245 ( (imm8 >> 2) & 3 ), 4246 ( (imm8 >> 4) & 3 ), 4247 ( (imm8 >> 6) & 3 ), 4248 4, 5, 6, 7, 4249 ( (imm8 >> 0) & 3 ) + 8, 4250 ( (imm8 >> 2) & 3 ) + 8, 4251 ( (imm8 >> 4) & 3 ) + 8, 4252 ( (imm8 >> 6) & 3 ) + 8, 4253 12, 13, 14, 15) 4254 (cast(short16)a, cast(short16)a); 4255 } 4256 else 4257 { 4258 __m128i a_lo = _mm256_extractf128_si256!0(a); 4259 __m128i a_hi = _mm256_extractf128_si256!1(a); 4260 __m128i r_lo = _mm_shufflelo_epi16!imm8(a_lo); 4261 __m128i r_hi = _mm_shufflelo_epi16!imm8(a_hi); 4262 return _mm256_set_m128i(r_hi, r_lo); 4263 } 4264 } 4265 unittest 4266 { 4267 __m256i A = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 4268 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 4269 short16 B = cast(short16) _mm256_shufflelo_epi16!SHUFFLE(A); 4270 short[16] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7, 11, 10, 9, 8, 12, 13, 14, 15 ]; 4271 assert(B.array == expectedB); 4272 } 4273 4274 /// Negate packed signed 16-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative. 4275 /// Elements in result are zeroed out when the corresponding element in `b` is zero. 4276 __m256i _mm256_sign_epi16 (__m256i a, __m256i b) pure @safe 4277 { 4278 // PERF DMD 4279 static if (GDC_with_AVX2) 4280 { 4281 return cast(__m256i) __builtin_ia32_psignw256(cast(short16)a, cast(short16)b); 4282 } 4283 else static if (LDC_with_AVX2) 4284 { 4285 return cast(__m256i) __builtin_ia32_psignw256(cast(short16)a, cast(short16)b); 4286 } 4287 else // split 4288 { 4289 __m128i a_lo = _mm256_extractf128_si256!0(a); 4290 __m128i a_hi = _mm256_extractf128_si256!1(a); 4291 __m128i b_lo = _mm256_extractf128_si256!0(b); 4292 __m128i b_hi = _mm256_extractf128_si256!1(b); 4293 __m128i r_lo = _mm_sign_epi16(a_lo, b_lo); 4294 __m128i r_hi = _mm_sign_epi16(a_hi, b_hi); 4295 return _mm256_set_m128i(r_hi, r_lo); 4296 } 4297 // PERF: not optimal in AVX without AVX2 4298 } 4299 unittest 4300 { 4301 __m128i A = _mm_setr_epi16(-2, -1, 0, 1, 2, short.min, short.min, short.min); 4302 __m128i B = _mm_setr_epi16(-1, 0,-1, 1, -2, -50, 0, 50); 4303 __m256i AA = _mm256_set_m128i(A, A); 4304 __m256i BB = _mm256_set_m128i(B, B); 4305 short16 C = cast(short16) _mm256_sign_epi16(AA, BB); 4306 short[16] correct = [ 2, 0, 0, 1, -2, short.min, 0, short.min, 2, 0, 0, 1, -2, short.min, 0, short.min]; 4307 assert(C.array == correct); 4308 } 4309 4310 /// Negate packed signed 32-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative. 4311 /// Elements in result are zeroed out when the corresponding element in `b` is zero. 4312 __m256i _mm256_sign_epi32 (__m256i a, __m256i b) pure @safe 4313 { 4314 // PERF DMD 4315 static if (GDC_with_AVX2) 4316 { 4317 return cast(__m256i) __builtin_ia32_psignd256(cast(int8)a, cast(int8)b); 4318 } 4319 else static if (LDC_with_AVX2) 4320 { 4321 return cast(__m256i) __builtin_ia32_psignd256(cast(int8)a, cast(int8)b); 4322 } 4323 else // split 4324 { 4325 __m128i a_lo = _mm256_extractf128_si256!0(a); 4326 __m128i a_hi = _mm256_extractf128_si256!1(a); 4327 __m128i b_lo = _mm256_extractf128_si256!0(b); 4328 __m128i b_hi = _mm256_extractf128_si256!1(b); 4329 __m128i r_lo = _mm_sign_epi32(a_lo, b_lo); 4330 __m128i r_hi = _mm_sign_epi32(a_hi, b_hi); 4331 return _mm256_set_m128i(r_hi, r_lo); 4332 } 4333 // PERF: not optimal in AVX without AVX2 4334 } 4335 unittest 4336 { 4337 __m256i A = _mm256_setr_epi32(-2, -1, 0, int.max, -2, -1, 0, int.max); 4338 __m256i B = _mm256_setr_epi32(-1, 0, -1, 1, -1, 0, -1, 1); 4339 int8 C = cast(int8) _mm256_sign_epi32(A, B); 4340 int[8] correct = [ 2, 0, 0, int.max, 2, 0, 0, int.max]; 4341 assert(C.array == correct); 4342 } 4343 4344 /// Negate packed signed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative. 4345 /// Elements in result are zeroed out when the corresponding element in `b` is zero. 4346 __m256i _mm256_sign_epi8 (__m256i a, __m256i b) pure @safe 4347 { 4348 // PERF DMD 4349 static if (GDC_with_AVX2) 4350 { 4351 return cast(__m256i) __builtin_ia32_psignb256(cast(ubyte32)a, cast(ubyte32)b); 4352 } 4353 else static if (LDC_with_AVX2) 4354 { 4355 return cast(__m256i) __builtin_ia32_psignb256(cast(byte32)a, cast(byte32)b); 4356 } 4357 else // split 4358 { 4359 // LDC arm64, 10 inst since LDC 1.32.1 -O1 4360 __m128i a_lo = _mm256_extractf128_si256!0(a); 4361 __m128i a_hi = _mm256_extractf128_si256!1(a); 4362 __m128i b_lo = _mm256_extractf128_si256!0(b); 4363 __m128i b_hi = _mm256_extractf128_si256!1(b); 4364 __m128i r_lo = _mm_sign_epi8(a_lo, b_lo); 4365 __m128i r_hi = _mm_sign_epi8(a_hi, b_hi); 4366 return _mm256_set_m128i(r_hi, r_lo); 4367 } 4368 // PERF: not optimal in AVX without AVX2 4369 } 4370 unittest 4371 { 4372 __m256i A = _mm256_setr_epi8( 1, 1, 1, 1, 1, 1, -2, 1, 0, 1, 0, 0, 0, 0, -2, 1, 4373 -2, -1, 0, 1, 2, byte.min, byte.min, byte.min, -1, 0,-1, 1, -2, -50, 0, 50); 4374 __m256i B = _mm256_setr_epi8(-1, 0,-1, 1, -2, -50, 0, 50, -1, 0,-1, 1, -2, -50, 0, 50, 4375 -1, 0,-1, 1, -2, -50, 0, 50, -2, -1, 0, 1, 2, byte.min, byte.min, byte.min); 4376 byte32 C = cast(byte32) _mm256_sign_epi8(A, B); 4377 byte[32] correct = [ -1, 0,-1, 1, -1, -1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 4378 2, 0, 0, 1, -2, byte.min, 0, byte.min, 1, 0, 0, 1, -2, 50, 0, -50]; 4379 assert(C.array == correct); 4380 } 4381 4382 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeroes. 4383 /// Bit-shift is a single value in the low-order 64-bit of `count`. 4384 /// If bit-shift > 15, result is defined to be all zeroes. 4385 /// Note: prefer `_mm256_slli_epi16`, less of a trap. 4386 __m256i _mm256_sll_epi16 (__m256i a, __m128i count) pure @trusted 4387 { 4388 // PERF ARM64 4389 static if (GDC_or_LDC_with_AVX2) 4390 { 4391 return cast(__m256i) __builtin_ia32_psllw256(cast(short16)a, cast(short8)count); 4392 } 4393 else 4394 { 4395 __m128i a_lo = _mm256_extractf128_si256!0(a); 4396 __m128i a_hi = _mm256_extractf128_si256!1(a); 4397 __m128i r_lo = _mm_sll_epi16(a_lo, count); 4398 __m128i r_hi = _mm_sll_epi16(a_hi, count); 4399 return _mm256_set_m128i(r_hi, r_lo); 4400 } 4401 } 4402 unittest 4403 { 4404 __m128i shift0 = _mm_setzero_si128(); 4405 __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift 4406 __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); 4407 __m256i A = _mm256_setr_epi16(4, -8, 11, -32768, 4, -8, 11, -32768, 4, -8, 11, -32768, 4, -8, 11, -32768); 4408 short[16] correct0 = (cast(short16)A).array; 4409 short[16] correctX = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 4410 short[16] correct2 = [16, -32, 44, 0, 16, -32, 44, 0, 16, -32, 44, 0, 16, -32, 44, 0]; 4411 short16 B0 = cast(short16) _mm256_sll_epi16(A, shift0); 4412 short16 BX = cast(short16) _mm256_sll_epi16(A, shiftX); 4413 short16 B2 = cast(short16) _mm256_sll_epi16(A, shift2); 4414 assert(B0.array == correct0); 4415 assert(BX.array == correctX); 4416 assert(B2.array == correct2); 4417 } 4418 4419 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeroes. 4420 /// Bit-shift is a single value in the low-order 64-bit of `count`. 4421 /// If bit-shift > 31, result is defined to be all zeroes. 4422 /// Note: prefer `_mm256_slli_epi32`, less of a trap. 4423 __m256i _mm256_sll_epi32 (__m256i a, __m128i count) pure @trusted 4424 { 4425 // PERF ARM64 4426 static if (GDC_or_LDC_with_AVX2) 4427 { 4428 return cast(__m256i) __builtin_ia32_pslld256(cast(int8)a, count); 4429 } 4430 else 4431 { 4432 __m128i a_lo = _mm256_extractf128_si256!0(a); 4433 __m128i a_hi = _mm256_extractf128_si256!1(a); 4434 __m128i r_lo = _mm_sll_epi32(a_lo, count); 4435 __m128i r_hi = _mm_sll_epi32(a_hi, count); 4436 return _mm256_set_m128i(r_hi, r_lo); 4437 } 4438 } 4439 unittest 4440 { 4441 __m128i shift0 = _mm_setzero_si128(); 4442 __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift 4443 __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); 4444 __m256i A = _mm256_setr_epi32(4, -9, 11, -2147483648, 2, -9, 11, -2147483648); 4445 int[8] correct0 = (cast(int8)A).array; 4446 int[8] correctX = [0, 0, 0, 0, 0, 0, 0, 0]; 4447 int[8] correct2 = [16, -36, 44, 0, 8, -36, 44, 0]; 4448 int8 B0 = cast(int8) _mm256_sll_epi32(A, shift0); 4449 int8 BX = cast(int8) _mm256_sll_epi32(A, shiftX); 4450 int8 B2 = cast(int8) _mm256_sll_epi32(A, shift2); 4451 assert(B0.array == correct0); 4452 assert(BX.array == correctX); 4453 assert(B2.array == correct2); 4454 } 4455 4456 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeroes. 4457 /// Bit-shift is a single value in the low-order 64-bit of `count`. 4458 /// If bit-shift > 63, result is defined to be all zeroes. 4459 /// Note: prefer `_mm256_sll_epi64`, less of a trap. 4460 __m256i _mm256_sll_epi64 (__m256i a, __m128i count) pure @trusted 4461 { 4462 // PERF ARM64 4463 static if (GDC_or_LDC_with_AVX2) 4464 { 4465 return cast(__m256i) __builtin_ia32_psllq256(cast(long4)a, cast(long2)count); 4466 } 4467 else 4468 { 4469 __m128i a_lo = _mm256_extractf128_si256!0(a); 4470 __m128i a_hi = _mm256_extractf128_si256!1(a); 4471 __m128i r_lo = _mm_sll_epi64(a_lo, count); 4472 __m128i r_hi = _mm_sll_epi64(a_hi, count); 4473 return _mm256_set_m128i(r_hi, r_lo); 4474 } 4475 } 4476 unittest 4477 { 4478 __m128i shift0 = _mm_setzero_si128(); 4479 __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift 4480 __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); 4481 __m256i A = _mm256_setr_epi64(4, -9, 5, -8); 4482 long[4] correct0 = [ 4, -9, 5, -8]; 4483 long[4] correctX = [ 0, 0, 0, 0]; 4484 long[4] correct2 = [16, -36, 20, -32]; 4485 long4 B0 = cast(long4) _mm256_sll_epi64(A, shift0); 4486 long4 BX = cast(long4) _mm256_sll_epi64(A, shiftX); 4487 long4 B2 = cast(long4) _mm256_sll_epi64(A, shift2); 4488 assert(B0.array == correct0); 4489 assert(BX.array == correctX); 4490 assert(B2.array == correct2); 4491 } 4492 4493 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 4494 __m256i _mm256_slli_epi16(__m256i a, int imm8) pure @safe 4495 { 4496 static if (GDC_or_LDC_with_AVX2) 4497 { 4498 return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8); 4499 } 4500 else // split 4501 { 4502 __m128i a_lo = _mm256_extractf128_si256!0(a); 4503 __m128i a_hi = _mm256_extractf128_si256!1(a); 4504 __m128i r_lo = _mm_slli_epi16(a_lo, imm8); 4505 __m128i r_hi = _mm_slli_epi16(a_hi, imm8); 4506 return _mm256_set_m128i(r_hi, r_lo); 4507 } 4508 } 4509 unittest 4510 { 4511 __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7); 4512 short16 B = cast(short16)( _mm256_slli_epi16(A, 1) ); 4513 short16 B2 = cast(short16)( _mm256_slli_epi16(A, 1 + 256) ); 4514 short[16] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14, 0, 2, 4, 6, -8, -10, 12, 14 ]; 4515 assert(B.array == expectedB); 4516 assert(B2.array == expectedB); 4517 4518 short16 C = cast(short16)( _mm256_slli_epi16(A, 16) ); 4519 short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]; 4520 assert(C.array == expectedC); 4521 } 4522 4523 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. 4524 __m256i _mm256_slli_epi32 (__m256i a, int imm8) pure @safe 4525 { 4526 static if (GDC_or_LDC_with_AVX2) 4527 { 4528 return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8); 4529 } 4530 else 4531 { 4532 __m128i a_lo = _mm256_extractf128_si256!0(a); 4533 __m128i a_hi = _mm256_extractf128_si256!1(a); 4534 __m128i r_lo = _mm_slli_epi32(a_lo, imm8); 4535 __m128i r_hi = _mm_slli_epi32(a_hi, imm8); 4536 return _mm256_set_m128i(r_hi, r_lo); 4537 } 4538 } 4539 unittest 4540 { 4541 __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -9); 4542 int8 B = cast(int8) _mm256_slli_epi32(A, 1); 4543 int8 B2 = cast(int8) _mm256_slli_epi32(A, 1 + 256); 4544 int[8] expectedB = [ 0, 4, 6, -8, 0, 4, 6, -18 ]; 4545 assert(B.array == expectedB); 4546 assert(B2.array == expectedB); 4547 4548 int8 C = cast(int8) _mm256_slli_epi32(A, 0); 4549 int[8] expectedC = [ 0, 2, 3, -4, 0, 2, 3, -9 ]; 4550 assert(C.array == expectedC); 4551 4552 int8 D = cast(int8) _mm256_slli_epi32(A, 65); 4553 int[8] expectedD = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 4554 assert(D.array == expectedD); 4555 } 4556 4557 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 4558 __m256i _mm256_slli_epi64 (__m256i a, int imm8) pure @safe 4559 { 4560 static if (GDC_or_LDC_with_AVX2) 4561 { 4562 return cast(__m256i) __builtin_ia32_psllqi256(cast(long4)a, cast(ubyte)imm8); 4563 } 4564 else 4565 { 4566 __m128i a_lo = _mm256_extractf128_si256!0(a); 4567 __m128i a_hi = _mm256_extractf128_si256!1(a); 4568 __m128i r_lo = _mm_slli_epi64(a_lo, imm8); 4569 __m128i r_hi = _mm_slli_epi64(a_hi, imm8); 4570 return _mm256_set_m128i(r_hi, r_lo); 4571 } 4572 } 4573 unittest 4574 { 4575 __m256i A = _mm256_setr_epi64(23, -4, 1, long.max); 4576 long4 B = cast(long4) _mm256_slli_epi64(A, 1); 4577 long4 B2 = cast(long4) _mm256_slli_epi64(A, 1 + 256); 4578 4579 long[4] expectedB = [ 46, -8, 2, -2]; 4580 assert(B.array == expectedB); 4581 assert(B2.array == expectedB); 4582 4583 long4 C = cast(long4) _mm256_slli_epi64(A, 0); 4584 long[4] expectedC = [ 23, -4, 1, long.max ]; 4585 assert(C.array == expectedC); 4586 4587 long4 D = cast(long4) _mm256_slli_epi64(A, 65); 4588 long[4] expectedD = [ 0, 0, 0, 0 ]; 4589 assert(D.array == expectedD); 4590 } 4591 4592 /// Shift 128-bit lanes in `a` left by `bytes` bytes while shifting in zeroes. 4593 alias _mm256_slli_si256 = _mm256_bslli_epi128; 4594 4595 /// Shift packed 32-bit integers in `a` left by the amount specified by the corresponding element in `count` while shifting in zeroes. 4596 __m128i _mm_sllv_epi32(__m128i a, __m128i count) pure @trusted 4597 { 4598 static if (GDC_with_AVX2 || LDC_with_AVX2) 4599 return cast(__m128i)__builtin_ia32_psllv4si(cast(byte16)a, cast(byte16)count); 4600 else 4601 { 4602 // UB if b[n] >= 32 4603 __m128i R = _mm_setr_epi32(a.array[0] << count.array[0], 4604 a.array[1] << count.array[1], 4605 a.array[2] << count.array[2], 4606 a.array[3] << count.array[3]); 4607 4608 // Map large and negative shifts to 32 4609 __m128i mm32 = _mm_set1_epi32(32); 4610 __m128i shift = _mm_min_epu32(count, mm32); 4611 4612 // Set to 0 where the shift is >= 32 4613 R = R & _mm_cmplt_epi32(shift, mm32); 4614 return R; 4615 } 4616 } 4617 unittest 4618 { 4619 __m128i A = _mm_setr_epi32(-1, 1, 4, -4); 4620 __m128i shift = _mm_setr_epi32( 2, -6, 1, 32); 4621 int4 R = cast(int4) _mm_sllv_epi32(A, shift); 4622 int[4] expected = [ -4, 0, 8, 0 ]; 4623 assert(R.array == expected); 4624 } 4625 4626 /// Shift packed 32-bit integers in `a` left by the amount specified by the corresponding element in `count` while shifting in zeroes. 4627 __m256i _mm256_sllv_epi32 (__m256i a, __m256i count) pure @safe 4628 { 4629 static if (GDC_with_AVX2 || LDC_with_AVX2) 4630 return cast(__m256i)__builtin_ia32_psllv8si(cast(int8)a, cast(int8)count); 4631 else 4632 { 4633 // split 4634 __m128i a_lo = _mm256_extractf128_si256!0(a); 4635 __m128i a_hi = _mm256_extractf128_si256!1(a); 4636 __m128i c_lo = _mm256_extractf128_si256!0(count); 4637 __m128i c_hi = _mm256_extractf128_si256!1(count); 4638 __m128i r_lo = _mm_sllv_epi32(a_lo, c_lo); 4639 __m128i r_hi = _mm_sllv_epi32(a_hi, c_hi); 4640 return _mm256_set_m128i(r_hi, r_lo); 4641 } 4642 } 4643 unittest 4644 { 4645 __m256i A = _mm256_setr_epi32(-1, 1, 4, -4, -1, 1, 4, -4); 4646 __m256i shift = _mm256_setr_epi32( 2, -6, 1, 32, 2, -6, 33, 32); 4647 int8 R = cast(int8) _mm256_sllv_epi32(A, shift); 4648 int[8] expected = [ -4, 0, 8, 0, -4, 0, 0, 0 ]; 4649 assert(R.array == expected); 4650 } 4651 4652 4653 /// Shift packed 64-bit integers in `a` left by the amount specified by the corresponding element in `b` while shifting in zeros. 4654 __m128i _mm_sllv_epi64(__m128i a, __m128i count) pure @trusted 4655 { 4656 static if (GDC_with_AVX2 || LDC_with_AVX2) 4657 { 4658 return cast(__m128i)__builtin_ia32_psllv2di(cast(long2)a, cast(long2)count); 4659 } 4660 else 4661 { 4662 // PERF arm64 4663 // LDC: x86, it's not good, but at least it's branchless 4664 long2 la = cast(long2)a; 4665 long2 lb = cast(long2)count; 4666 long2 R; 4667 R.ptr[0] = cast(uint)(lb.array[0]) < 64 ? (la.array[0] << lb.array[0]) : 0; 4668 R.ptr[1] = cast(uint)(lb.array[1]) < 64 ? (la.array[1] << lb.array[1]) : 0; 4669 return cast(__m128i)R; 4670 } 4671 } 4672 unittest 4673 { 4674 __m128i A = _mm_setr_epi64( -4, 6); 4675 __m128i B1 = _mm_setr_epi64( 2, 0); 4676 __m128i B2 = _mm_setr_epi64(-12, 64); 4677 long2 R1 = cast(long2) _mm_sllv_epi64(A, B1); 4678 long2 R2 = cast(long2) _mm_sllv_epi64(A, B2); 4679 long[2] correct1 = [-16, 6]; 4680 long[2] correct2 = [ 0, 0]; 4681 assert(R1.array == correct1); 4682 assert(R2.array == correct2); 4683 } 4684 4685 /// Shift packed 64-bit integers in `a` left by the amount specified by the corresponding element in `count` while shifting in zeroes. 4686 __m256i _mm256_sllv_epi64 (__m256i a, __m256i count) pure @safe 4687 { 4688 static if (GDC_with_AVX2 || LDC_with_AVX2) 4689 return cast(__m256i)__builtin_ia32_psllv4di(cast(long4)a, cast(long4)count); 4690 else 4691 { 4692 // split 4693 __m128i a_lo = _mm256_extractf128_si256!0(a); 4694 __m128i a_hi = _mm256_extractf128_si256!1(a); 4695 __m128i c_lo = _mm256_extractf128_si256!0(count); 4696 __m128i c_hi = _mm256_extractf128_si256!1(count); 4697 __m128i r_lo = _mm_sllv_epi64(a_lo, c_lo); 4698 __m128i r_hi = _mm_sllv_epi64(a_hi, c_hi); 4699 return _mm256_set_m128i(r_hi, r_lo); 4700 } 4701 } 4702 unittest 4703 { 4704 __m256i A = _mm256_setr_epi64( -4, 6, -1, 6); 4705 __m256i B1 = _mm256_setr_epi64( 2, 0, 3, 1); 4706 __m256i B2 = _mm256_setr_epi64(-12, 64, 63, 64); 4707 long4 R1 = cast(long4) _mm256_sllv_epi64(A, B1); 4708 long4 R2 = cast(long4) _mm256_sllv_epi64(A, B2); 4709 long[4] correct1 = [-16, 6, -8, 12]; 4710 long[4] correct2 = [ 0, 0, long.min, 0]; 4711 assert(R1.array == correct1); 4712 assert(R2.array == correct2); 4713 } 4714 4715 4716 4717 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits. 4718 /// Bit-shift is a single value in the low-order 64-bit of `count`. 4719 /// If bit-shift > 15, result is defined to be all sign bits. 4720 /// Warning: prefer `_mm256_srai_epi16`, less of a trap. 4721 __m256i _mm256_sra_epi16 (__m256i a, __m128i count) pure @trusted 4722 { 4723 static if (GDC_or_LDC_with_AVX2) 4724 { 4725 return cast(__m256i) __builtin_ia32_psraw256(cast(short16)a, cast(short8)count); 4726 } 4727 else 4728 { 4729 // split 4730 __m128i a_lo = _mm256_extractf128_si256!0(a); 4731 __m128i a_hi = _mm256_extractf128_si256!1(a); 4732 __m128i r_lo = _mm_sra_epi16(a_lo, count); 4733 __m128i r_hi = _mm_sra_epi16(a_hi, count); 4734 return _mm256_set_m128i(r_hi, r_lo); 4735 } 4736 } 4737 unittest 4738 { 4739 __m128i shift0 = _mm_setzero_si128(); 4740 __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift 4741 __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); 4742 __m256i A = _mm256_setr_epi16(4, -9, 11, -32768, 4, -8, 11, -32768, 4743 4, -9, 11, -32768, 4, -8, 11, -32768); 4744 short[16] correct0 = (cast(short16)A).array; 4745 short[16] correctX = [0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1]; 4746 short[16] correct2 = [1, -3, 2, -8192, 1, -2, 2, -8192, 1, -3, 2, -8192, 1, -2, 2, -8192]; 4747 short16 B0 = cast(short16) _mm256_sra_epi16(A, shift0); 4748 short16 BX = cast(short16) _mm256_sra_epi16(A, shiftX); 4749 short16 B2 = cast(short16) _mm256_sra_epi16(A, shift2); 4750 assert(B0.array == correct0); 4751 assert(BX.array == correctX); 4752 assert(B2.array == correct2); 4753 } 4754 4755 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits. 4756 /// Bit-shift is a single value in the low-order 64-bit of `count`. 4757 /// If bit-shift > 31, result is defined to be all sign bits. 4758 /// Warning: prefer `_mm256_sra_epi32`, less of a trap. 4759 __m256i _mm256_sra_epi32 (__m256i a, __m128i count) pure @trusted 4760 { 4761 static if (GDC_or_LDC_with_AVX2) 4762 { 4763 return cast(__m256i) __builtin_ia32_psrad256(cast(int8)a, cast(int4)count); 4764 } 4765 else 4766 { 4767 // split 4768 __m128i a_lo = _mm256_extractf128_si256!0(a); 4769 __m128i a_hi = _mm256_extractf128_si256!1(a); 4770 __m128i r_lo = _mm_sra_epi32(a_lo, count); 4771 __m128i r_hi = _mm_sra_epi32(a_hi, count); 4772 return _mm256_set_m128i(r_hi, r_lo); 4773 } 4774 } 4775 unittest 4776 { 4777 __m128i shift0 = _mm_setzero_si128(); 4778 __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift 4779 __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); 4780 __m256i A = _mm256_setr_epi32(4, -9, 11, -2147483648, 8, -9, 11, -2147483648); 4781 int[8] correct0 = (cast(int8)A).array; 4782 int[8] correctX = [0, -1, 0, -1, 0, -1, 0, -1]; 4783 int[8] correct2 = [1, -3, 2, -536870912, 2, -3, 2, -536870912]; 4784 int8 B0 = cast(int8) _mm256_sra_epi32(A, shift0); 4785 int8 BX = cast(int8) _mm256_sra_epi32(A, shiftX); 4786 int8 B2 = cast(int8) _mm256_sra_epi32(A, shift2); 4787 assert(B0.array == correct0); 4788 assert(BX.array == correctX); 4789 assert(B2.array == correct2); 4790 } 4791 4792 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 4793 __m256i _mm256_srai_epi16 (__m256i a, int imm8) pure @safe 4794 { 4795 static if (GDC_or_LDC_with_AVX2) 4796 { 4797 return cast(__m256i) __builtin_ia32_psrawi256(cast(short16)a, cast(ubyte)imm8); 4798 } 4799 else 4800 { 4801 // split 4802 __m128i a_lo = _mm256_extractf128_si256!0(a); 4803 __m128i a_hi = _mm256_extractf128_si256!1(a); 4804 __m128i r_lo = _mm_srai_epi16(a_lo, imm8); 4805 __m128i r_hi = _mm_srai_epi16(a_hi, imm8); 4806 return _mm256_set_m128i(r_hi, r_lo); 4807 } 4808 } 4809 unittest 4810 { 4811 __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, short.min, short.max, 2, 3, -4, -5, 6, 7); 4812 short16 B = cast(short16)( _mm256_srai_epi16(A, 1) ); 4813 short16 B2 = cast(short16)( _mm256_srai_epi16(A, 1 + 256) ); 4814 short[16] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3, -16384, 16383, 1, 1, -2, -3, 3, 3 ]; 4815 assert(B.array == expectedB); 4816 assert(B2.array == expectedB); 4817 4818 short16 C = cast(short16)( _mm256_srai_epi16(A, 18) ); 4819 short[16] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0, 4820 -1, 0, 0, 0, -1, -1, 0, 0 ]; 4821 assert(C.array == expectedC); 4822 } 4823 4824 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 4825 __m256i _mm256_srai_epi32 (__m256i a, int imm8) pure @safe 4826 { 4827 static if (GDC_or_LDC_with_AVX2) 4828 { 4829 return cast(__m256i) __builtin_ia32_psradi256(cast(int8)a, cast(ubyte)imm8); 4830 } 4831 else // split 4832 { 4833 __m128i a_lo = _mm256_extractf128_si256!0(a); 4834 __m128i a_hi = _mm256_extractf128_si256!1(a); 4835 __m128i r_lo = _mm_srai_epi32(a_lo, imm8); 4836 __m128i r_hi = _mm_srai_epi32(a_hi, imm8); 4837 return _mm256_set_m128i(r_hi, r_lo); 4838 } 4839 } 4840 unittest 4841 { 4842 __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4); 4843 int8 B = cast(int8) _mm256_srai_epi32(A, 1); 4844 int8 B2 = cast(int8) _mm256_srai_epi32(A, 1 + 256); 4845 int[8] expectedB = [ 0, 1, 1, -2, 0, 1, 1, -2]; 4846 assert(B.array == expectedB); 4847 assert(B2.array == expectedB); 4848 4849 int8 C = cast(int8) _mm256_srai_epi32(A, 32); 4850 int[8] expectedC = [ 0, 0, 0, -1, 0, 0, 0, -1]; 4851 assert(C.array == expectedC); 4852 4853 int8 D = cast(int8) _mm256_srai_epi32(A, 0); 4854 int[8] expectedD = [ 0, 2, 3, -4, 0, 2, 3, -4]; 4855 assert(D.array == expectedD); 4856 } 4857 4858 __m128i _mm_srav_epi32(__m128i a, __m128i count) pure @trusted 4859 { 4860 static if (GDC_with_AVX2 || LDC_with_AVX2) 4861 return cast(__m128i)__builtin_ia32_psrav4si(cast(int4)a, cast(int4)count); 4862 else 4863 { 4864 __m128i R = _mm_setr_epi32(a.array[0] >> count.array[0], 4865 a.array[1] >> count.array[1], 4866 a.array[2] >> count.array[2], 4867 a.array[3] >> count.array[3]); 4868 4869 // Map large and negative shifts to all sign bits 4870 __m128i signbits = _mm_srai_epi32(a, 31); 4871 __m128i mm32 = _mm_set1_epi32(32); 4872 __m128i shift = _mm_min_epu32(count, mm32); 4873 4874 // Set to 0 where the shift is >= 32 4875 __m128i lower = _mm_cmplt_epi32(shift, mm32); 4876 4877 R = (R & lower) | (signbits & ~lower); 4878 return R; 4879 } 4880 } 4881 unittest 4882 { 4883 __m128i A = _mm_setr_epi32(-1, 1, -4, -4); 4884 __m128i shift = _mm_setr_epi32( 2, -6, 31, 32); 4885 int4 R = cast(int4) _mm_srav_epi32(A, shift); 4886 int[4] expected = [ -1, 0, -1, -1 ]; 4887 assert(R.array == expected); 4888 } 4889 4890 __m256i _mm256_srav_epi32 (__m256i a, __m256i count) pure @safe 4891 { 4892 static if (GDC_or_LDC_with_AVX2) 4893 { 4894 return cast(__m256i) __builtin_ia32_psrav8si(cast(int8)a, cast(int8)count); 4895 } 4896 else // split 4897 { 4898 __m128i a_lo = _mm256_extractf128_si256!0(a); 4899 __m128i a_hi = _mm256_extractf128_si256!1(a); 4900 __m128i c_lo = _mm256_extractf128_si256!0(count); 4901 __m128i c_hi = _mm256_extractf128_si256!1(count); 4902 __m128i r_lo = _mm_srav_epi32(a_lo, c_lo); 4903 __m128i r_hi = _mm_srav_epi32(a_hi, c_hi); 4904 return _mm256_set_m128i(r_hi, r_lo); 4905 } 4906 } 4907 unittest 4908 { 4909 __m128i A = _mm_setr_epi32(-1, 1, -4, -4); 4910 __m128i shift = _mm_setr_epi32( 2, -6, 31, 32); 4911 int4 R = cast(int4) _mm_srav_epi32(A, shift); 4912 int[4] expected = [ -1, 0, -1, -1 ]; 4913 assert(R.array == expected); 4914 } 4915 4916 /// Shift packed 16-bit integers in `a` right by `count` while shifting in zeroes. 4917 /// Bit-shift is a single value in the low-order 64-bit of `count`. 4918 /// If bit-shift > 15, result is defined to be all zeroes. 4919 /// Note: prefer `_mm256_srli_epi16`, less of a trap. 4920 __m256i _mm256_srl_epi16 (__m256i a, __m128i count) pure @trusted 4921 { 4922 // PERF ARM64 4923 static if (GDC_or_LDC_with_AVX2) 4924 { 4925 return cast(__m256i) __builtin_ia32_psrlw256(cast(short16)a, cast(short8)count); 4926 } 4927 else 4928 { 4929 __m128i a_lo = _mm256_extractf128_si256!0(a); 4930 __m128i a_hi = _mm256_extractf128_si256!1(a); 4931 __m128i r_lo = _mm_srl_epi16(a_lo, count); 4932 __m128i r_hi = _mm_srl_epi16(a_hi, count); 4933 return _mm256_set_m128i(r_hi, r_lo); 4934 } 4935 } 4936 unittest 4937 { 4938 __m128i shift0 = _mm_setzero_si128(); 4939 __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift 4940 __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); 4941 __m256i A = _mm256_setr_epi16(4, -8, 11, -32768, 4, -8, 11, -32768, 4, -8, 11, -32768, 4, -8, 11, -32768); 4942 short[16] correct0 = (cast(short16)A).array; 4943 short[16] correctX = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 4944 short[16] correct2 = [1, 16382, 2, 8192, 1, 16382, 2, 8192, 1, 16382, 2, 8192, 1, 16382, 2, 8192]; 4945 short16 B0 = cast(short16) _mm256_srl_epi16(A, shift0); 4946 short16 BX = cast(short16) _mm256_srl_epi16(A, shiftX); 4947 short16 B2 = cast(short16) _mm256_srl_epi16(A, shift2); 4948 assert(B0.array == correct0); 4949 assert(BX.array == correctX); 4950 assert(B2.array == correct2); 4951 } 4952 4953 /// Shift packed 32-bit integers in `a` right by `count` while shifting in zeroes. 4954 /// Bit-shift is a single value in the low-order 64-bit of `count`. 4955 /// If bit-shift > 31, result is defined to be all zeroes. 4956 /// Note: prefer `_mm256_srli_epi32`, less of a trap. 4957 __m256i _mm256_srl_epi32 (__m256i a, __m128i count) pure @trusted 4958 { 4959 // PERF ARM64 4960 static if (GDC_or_LDC_with_AVX2) 4961 { 4962 return cast(__m256i) __builtin_ia32_psrld256(cast(int8)a, count); 4963 } 4964 else 4965 { 4966 __m128i a_lo = _mm256_extractf128_si256!0(a); 4967 __m128i a_hi = _mm256_extractf128_si256!1(a); 4968 __m128i r_lo = _mm_srl_epi32(a_lo, count); 4969 __m128i r_hi = _mm_srl_epi32(a_hi, count); 4970 return _mm256_set_m128i(r_hi, r_lo); 4971 } 4972 } 4973 unittest 4974 { 4975 __m128i shift0 = _mm_setzero_si128(); 4976 __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift 4977 __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); 4978 __m256i A = _mm256_setr_epi32(4, -8, 11, -0x80000000, 0, 1, -11, 0x7fffffff); 4979 int[8] correct0 = (cast(int8)A).array; 4980 int[8] correctX = [0, 0, 0, 0, 0, 0, 0, 0]; 4981 int[8] correct2 = [1, 1073741822, 2, 536870912, 0, 0, 1073741821, 0x1fffffff]; 4982 int8 B0 = cast(int8) _mm256_srl_epi32(A, shift0); 4983 int8 BX = cast(int8) _mm256_srl_epi32(A, shiftX); 4984 int8 B2 = cast(int8) _mm256_srl_epi32(A, shift2); 4985 assert(B0.array == correct0); 4986 assert(BX.array == correctX); 4987 assert(B2.array == correct2); 4988 } 4989 4990 /// Shift packed 64-bit integers in `a` right by `count` while shifting in zeroes. 4991 /// Bit-shift is a single value in the low-order 64-bit of `count`. 4992 /// If bit-shift > 63, result is defined to be all zeroes. 4993 /// Note: prefer `_mm256_srli_epi64`, less of a trap. 4994 __m256i _mm256_srl_epi64 (__m256i a, __m128i count) pure @trusted 4995 { 4996 // PERF ARM64 4997 /* 4998 static if (LDC_with_ARM64) 4999 { 5000 long bs = (cast(long2)count).array[0]; 5001 if (bs > 63) 5002 return long4(0); 5003 else 5004 { 5005 a <<= long4(bs); 5006 return a; 5007 } 5008 } 5009 else*/ static if (GDC_or_LDC_with_AVX2) 5010 { 5011 return cast(__m256i) __builtin_ia32_psrlq256(cast(long4)a, cast(long2)count); 5012 } 5013 else 5014 { 5015 __m128i a_lo = _mm256_extractf128_si256!0(a); 5016 __m128i a_hi = _mm256_extractf128_si256!1(a); 5017 __m128i r_lo = _mm_srl_epi64(a_lo, count); 5018 __m128i r_hi = _mm_srl_epi64(a_hi, count); 5019 return _mm256_set_m128i(r_hi, r_lo); 5020 } 5021 } 5022 unittest 5023 { 5024 __m128i shift0 = _mm_setzero_si128(); 5025 __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift 5026 __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5); 5027 __m256i A = _mm256_setr_epi64(4, -9, 8, -9); 5028 long[4] correct0 = [ 4, -9, 8, -9]; 5029 long[4] correctX = [ 0, 0, 0, 0]; 5030 long[4] correct2 = [ 1, 4611686018427387901, 2, 4611686018427387901]; 5031 long4 B0 = cast(long4) _mm256_srl_epi64(A, shift0); 5032 long4 BX = cast(long4) _mm256_srl_epi64(A, shiftX); 5033 long4 B2 = cast(long4) _mm256_srl_epi64(A, shift2); 5034 assert(B0.array == correct0); 5035 assert(BX.array == correctX); 5036 assert(B2.array == correct2); 5037 } 5038 5039 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 5040 __m256i _mm256_srli_epi16 (__m256i a, int imm8) pure @trusted 5041 { 5042 static if (GDC_with_AVX2) 5043 { 5044 return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8); 5045 } 5046 else static if (LDC_with_AVX2) 5047 { 5048 return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8); 5049 } 5050 else 5051 { 5052 __m128i a_lo = _mm256_extractf128_si256!0(a); 5053 __m128i a_hi = _mm256_extractf128_si256!1(a); 5054 __m128i r_lo = _mm_srli_epi16(a_lo, imm8); 5055 __m128i r_hi = _mm_srli_epi16(a_hi, imm8); 5056 return _mm256_set_m128i(r_hi, r_lo); 5057 } 5058 } 5059 unittest 5060 { 5061 __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7); 5062 short16 B = cast(short16) _mm256_srli_epi16(A, 1); 5063 short16 B2 = cast(short16) _mm256_srli_epi16(A, 1 + 256); 5064 short[16] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3, 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 5065 assert(B.array == expectedB); 5066 assert(B2.array == expectedB); 5067 5068 short16 C = cast(short16) _mm256_srli_epi16(A, 16); 5069 short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]; 5070 assert(C.array == expectedC); 5071 5072 short16 D = cast(short16) _mm256_srli_epi16(A, 0); 5073 short[16] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7 ]; 5074 assert(D.array == expectedD); 5075 } 5076 5077 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 5078 __m256i _mm256_srli_epi32 (__m256i a, int imm8) pure @trusted 5079 { 5080 static if (GDC_with_AVX2) 5081 { 5082 return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8); 5083 } 5084 else static if (LDC_with_AVX2) 5085 { 5086 return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8); 5087 } 5088 else 5089 { 5090 // split 5091 __m128i a_lo = _mm256_extractf128_si256!0(a); 5092 __m128i a_hi = _mm256_extractf128_si256!1(a); 5093 __m128i r_lo = _mm_srli_epi32(a_lo, imm8); 5094 __m128i r_hi = _mm_srli_epi32(a_hi, imm8); 5095 return _mm256_set_m128i(r_hi, r_lo); 5096 } 5097 } 5098 unittest 5099 { 5100 __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4); 5101 int8 B = cast(int8) _mm256_srli_epi32(A, 1); 5102 int8 B2 = cast(int8) _mm256_srli_epi32(A, 1 + 256); 5103 int[8] expectedB = [ 0, 1, 1, 0x7FFFFFFE, 0, 1, 1, 0x7FFFFFFE]; 5104 assert(B.array == expectedB); 5105 assert(B2.array == expectedB); 5106 5107 int8 C = cast(int8) _mm256_srli_epi32(A, 255); 5108 int[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 5109 assert(C.array == expectedC); 5110 } 5111 5112 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 5113 __m256i _mm256_srli_epi64 (__m256i a, int imm8) pure @safe 5114 { 5115 static if (GDC_or_LDC_with_AVX2) 5116 { 5117 return cast(__m256i) __builtin_ia32_psrlqi256(cast(int8)a, cast(ubyte)imm8); 5118 } 5119 else 5120 { 5121 // split 5122 __m128i a_lo = _mm256_extractf128_si256!0(a); 5123 __m128i a_hi = _mm256_extractf128_si256!1(a); 5124 __m128i r_lo = _mm_srli_epi64(a_lo, imm8); 5125 __m128i r_hi = _mm_srli_epi64(a_hi, imm8); 5126 return _mm256_set_m128i(r_hi, r_lo); 5127 } 5128 } 5129 unittest 5130 { 5131 __m256i A = _mm256_setr_epi64(8, -4, 16, -8); 5132 long4 B = cast(long4) _mm256_srli_epi64(A, 1); 5133 long4 B2 = cast(long4) _mm256_srli_epi64(A, 1 + 512); 5134 long[4] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE, 8, 0x7FFFFFFFFFFFFFFC]; 5135 assert(B.array == expectedB); 5136 assert(B2.array == expectedB); 5137 5138 long4 C = cast(long4) _mm256_srli_epi64(A, 64); 5139 long[4] expectedC = [ 0, 0, 0, 0 ]; 5140 assert(C.array == expectedC); 5141 } 5142 5143 /// Shift 128-bit lanes in `a` right by `bytes` bytes while shifting in zeroes. 5144 alias _mm256_srli_si256 = _mm256_bsrli_epi128; 5145 5146 /// Shift packed 32-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in zeroes. 5147 __m128i _mm_srlv_epi32(__m128i a, __m128i count) pure @trusted 5148 { 5149 static if (GDC_with_AVX2 || LDC_with_AVX2) 5150 return cast(__m128i)__builtin_ia32_psrlv4si(cast(byte16)a, cast(byte16)count); 5151 else 5152 { 5153 __m128i R = _mm_setr_epi32(a.array[0] >>> count.array[0], 5154 a.array[1] >>> count.array[1], 5155 a.array[2] >>> count.array[2], 5156 a.array[3] >>> count.array[3]); 5157 5158 // Map large and negative shifts to 32 5159 __m128i mm32 = _mm_set1_epi32(32); 5160 __m128i shift = _mm_min_epu32(count, mm32); 5161 5162 // Set to 0 where the shift is >= 32 5163 R = R & _mm_cmplt_epi32(shift, mm32); 5164 return R; 5165 } 5166 } 5167 unittest 5168 { 5169 __m128i A = _mm_setr_epi32(-1, 1, 4, -4); 5170 __m128i shift = _mm_setr_epi32( 2, -6, 1, 32); 5171 int4 R = cast(int4) _mm_srlv_epi32(A, shift); 5172 int[4] expected = [ 1073741823, 0, 2, 0 ]; 5173 assert(R.array == expected); 5174 } 5175 5176 /// Shift packed 32-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in zeroes. 5177 __m256i _mm256_srlv_epi32 (__m256i a, __m256i count) pure @trusted 5178 { 5179 static if (GDC_with_AVX2 || LDC_with_AVX2) 5180 return cast(__m256i)__builtin_ia32_psrlv8si(cast(int8)a, cast(int8)count); 5181 else 5182 { 5183 // split 5184 __m128i a_lo = _mm256_extractf128_si256!0(a); 5185 __m128i a_hi = _mm256_extractf128_si256!1(a); 5186 __m128i c_lo = _mm256_extractf128_si256!0(count); 5187 __m128i c_hi = _mm256_extractf128_si256!1(count); 5188 __m128i r_lo = _mm_srlv_epi32(a_lo, c_lo); 5189 __m128i r_hi = _mm_srlv_epi32(a_hi, c_hi); 5190 return _mm256_set_m128i(r_hi, r_lo); 5191 } 5192 } 5193 unittest 5194 { 5195 __m256i A = _mm256_setr_epi32(-1, 1, 4, -4, -1, 1, 4, -4); 5196 __m256i shift = _mm256_setr_epi32( 2, -6, 1, 32, 33, 2, -6, 1); 5197 int8 R = cast(int8) _mm256_srlv_epi32(A, shift); 5198 int[8] expected = [ 1073741823, 0, 2, 0, 0, 0, 0, 2147483646 ]; 5199 assert(R.array == expected); 5200 } 5201 5202 /// Shift packed 64-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in zeroes. 5203 __m128i _mm_srlv_epi64(__m128i a, __m128i count) pure @trusted 5204 { 5205 static if (GDC_or_LDC_with_AVX2) 5206 { 5207 return cast(__m128i)__builtin_ia32_psrlv2di(cast(long2)a, cast(long2)count); 5208 } 5209 else 5210 { 5211 // Note: arm64 rather bad for LDC < 1.34 5212 // after that, perfect. 5213 // LDC: x86, it's not good, but at least it's branchless 5214 long2 la = cast(long2)a; 5215 long2 lb = cast(long2)count; 5216 long2 R; 5217 R.ptr[0] = cast(ulong)(lb.array[0]) < 64 ? (la.array[0] >>> lb.array[0]) : 0; 5218 R.ptr[1] = cast(ulong)(lb.array[1]) < 64 ? (la.array[1] >>> lb.array[1]) : 0; 5219 return cast(__m128i)R; 5220 } 5221 } 5222 unittest 5223 { 5224 __m256i A = _mm256_setr_epi64( -4, 6, -4, 6); 5225 __m256i B1 = _mm256_setr_epi64( 2, 0, 2, 0); 5226 __m256i B2 = _mm256_setr_epi64(-12, 64, -12, 64); 5227 long4 R1 = cast(long4) _mm256_srlv_epi64(A, B1); 5228 long4 R2 = cast(long4) _mm256_srlv_epi64(A, B2); 5229 long[4] correct1 = [ 4611686018427387903, 6, 4611686018427387903, 6]; 5230 long[4] correct2 = [ 0, 0, 0, 0]; 5231 assert(R1.array == correct1); 5232 assert(R2.array == correct2); 5233 } 5234 5235 /// Shift packed 64-bit integers in `a` right by the amount specified by the corresponding element in `count` while shifting in zeroes. 5236 __m256i _mm256_srlv_epi64 (__m256i a, __m256i count) pure @trusted 5237 { 5238 // PERF: rather lame in non-AVX2 x86 5239 static if (GDC_with_AVX2 || LDC_with_AVX2) 5240 return cast(__m256i)__builtin_ia32_psrlv4di(cast(long4)a, cast(long4)count); 5241 else 5242 { 5243 // split 5244 __m128i a_lo = _mm256_extractf128_si256!0(a); 5245 __m128i a_hi = _mm256_extractf128_si256!1(a); 5246 __m128i c_lo = _mm256_extractf128_si256!0(count); 5247 __m128i c_hi = _mm256_extractf128_si256!1(count); 5248 __m128i r_lo = _mm_srlv_epi64(a_lo, c_lo); 5249 __m128i r_hi = _mm_srlv_epi64(a_hi, c_hi); 5250 return _mm256_set_m128i(r_hi, r_lo); 5251 } 5252 } 5253 unittest 5254 { 5255 __m256i A = _mm256_setr_epi64( -4, 6, -4, 6); 5256 __m256i B1 = _mm256_setr_epi64( 2, 0, 2, 0); 5257 __m256i B2 = _mm256_setr_epi64(-12, 64, -12, 64); 5258 long4 R1 = cast(long4) _mm256_srlv_epi64(A, B1); 5259 long4 R2 = cast(long4) _mm256_srlv_epi64(A, B2); 5260 long[4] correct1 = [ 4611686018427387903, 6, 4611686018427387903, 6]; 5261 long[4] correct2 = [ 0, 0, 0, 0]; 5262 assert(R1.array == correct1); 5263 assert(R2.array == correct2); 5264 } 5265 5266 /// Load 256-bits of integer data from memory using a non-temporal memory hint. 5267 /// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be generated. 5268 __m256i _mm256_stream_load_si256 (const(__m256i)* mem_addr) pure @trusted 5269 { 5270 // PERF DMD D_SIMD 5271 static if (GDC_with_AVX2) 5272 { 5273 return cast(__m256i) __builtin_ia32_movntdqa256(cast(__m256i*)mem_addr); // const_cast 5274 } 5275 else static if (LDC_with_InlineIREx && LDC_with_optimizations) 5276 { 5277 enum prefix = `!0 = !{ i32 1 }`; 5278 enum ir = ` 5279 %r = load <4 x i64>, <4 x i64>* %0, !nontemporal !0 5280 ret <4 x i64> %r`; 5281 return cast(__m256i) LDCInlineIREx!(prefix, ir, "", long4, const(long4)*)(mem_addr); 5282 } 5283 else 5284 { 5285 return *mem_addr; // regular move instead 5286 } 5287 } 5288 unittest 5289 { 5290 align(32) static immutable int[8] correct = [1, 2, 3, 4, 5, 6, 7, 8]; 5291 __m256i A = _mm256_stream_load_si256(cast(__m256i*)correct.ptr); 5292 _mm_mfence(); 5293 assert((cast(int8)A).array == correct); 5294 } 5295 5296 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. 5297 __m256i _mm256_sub_epi16 (__m256i a, __m256i b) pure @safe 5298 { 5299 pragma(inline, true); 5300 return cast(__m256i)(cast(short16)a - cast(short16)b); 5301 } 5302 unittest 5303 { 5304 __m256i A = _mm256_setr_epi16( -7, -1, 0, 9, -100, 100, 234, 432, -32768, 32767, 0, -1, -20000, 0, 6, -2); 5305 short16 R = cast(short16) _mm256_sub_epi16(A, A); 5306 short[16] correct = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]; 5307 assert(R.array == correct); 5308 } 5309 5310 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. 5311 __m256i _mm256_sub_epi32(__m256i a, __m256i b) pure @safe 5312 { 5313 pragma(inline, true); 5314 return cast(__m256i)(cast(int8)a - cast(int8)b); 5315 } 5316 unittest 5317 { 5318 __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432); 5319 int8 R = cast(int8) _mm256_sub_epi32(A, A); 5320 int[8] correct = [ 0, 0, 0, 0, 0, 0, 0, 0]; 5321 assert(R.array == correct); 5322 } 5323 5324 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. 5325 __m256i _mm256_sub_epi64 (__m256i a, __m256i b) pure @safe 5326 { 5327 pragma(inline, true); 5328 return a - b; 5329 } 5330 unittest 5331 { 5332 __m256i A = _mm256_setr_epi64(-1, 0x8000_0000_0000_0000, 42, -12); 5333 long4 R = cast(__m256i) _mm256_sub_epi64(A, A); 5334 long[4] correct = [ 0, 0, 0, 0 ]; 5335 assert(R.array == correct); 5336 } 5337 5338 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. 5339 __m256i _mm256_sub_epi8 (__m256i a, __m256i b) pure @safe 5340 { 5341 pragma(inline, true); 5342 return cast(__m256i)(cast(byte32)a - cast(byte32)b); 5343 } 5344 unittest 5345 { 5346 __m256i A = _mm256_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78, 5347 4, 9, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -2, 0, 10, 78); 5348 byte32 R = cast(byte32) _mm256_sub_epi8(A, A); 5349 byte[32] correct; // zero initialized 5350 assert(R.array == correct); 5351 } 5352 5353 /// Subtract packed signed 16-bit integers in `b` from packed 16-bit integers in `a` using 5354 /// saturation. 5355 __m256i _mm256_subs_epi16 (__m256i a, __m256i b) pure @trusted 5356 { 5357 // PERF DMD 5358 static if (GDC_with_AVX2) 5359 { 5360 return cast(__m256i) __builtin_ia32_psubsw256(cast(short16)a, cast(short16)b); 5361 } 5362 else static if(LDC_with_saturated_intrinsics) 5363 { 5364 return cast(__m256i) inteli_llvm_subs!short16(cast(short16)a, cast(short16)b); 5365 } 5366 else 5367 { 5368 short16 r; 5369 short16 sa = cast(short16)a; 5370 short16 sb = cast(short16)b; 5371 foreach(i; 0..16) 5372 r.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]); 5373 return cast(__m256i)r; 5374 } 5375 } 5376 unittest 5377 { 5378 short16 res = cast(short16) _mm256_subs_epi16(_mm256_setr_epi16( 7, 6, 5, -32768, 3, 3, 32766, 0, 7, 6, 5, -32750, 3, 3, 32767, 0), 5379 _mm256_setr_epi16( 7, 6, 5, -30000, 3, 1, -2, -10, 7, 6, 5, 100, 3, 1, 1, -10)); 5380 static immutable short[16] correctResult = [ 0, 0, 0, -2768, 0, 2, 32767, 10, 0, 0, 0, -32768, 0, 2, 32766, 10]; 5381 assert(res.array == correctResult); 5382 } 5383 5384 5385 /// Subtract packed signed 8-bit integers in `b` from packed 8-bit integers in `a` using 5386 /// saturation. 5387 __m256i _mm256_subs_epi8 (__m256i a, __m256i b) pure @trusted 5388 { 5389 // PERF DMD 5390 static if (GDC_with_AVX2) 5391 { 5392 return cast(__m256i) __builtin_ia32_psubsb256(cast(ubyte32)a, cast(ubyte32)b); 5393 } 5394 else static if(LDC_with_saturated_intrinsics) 5395 { 5396 return cast(__m256i) inteli_llvm_subs!byte32(cast(byte32)a, cast(byte32)b); 5397 } 5398 else 5399 { 5400 byte32 r; 5401 byte32 sa = cast(byte32)a; 5402 byte32 sb = cast(byte32)b; 5403 foreach(i; 0..32) 5404 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]); 5405 return cast(__m256i)r; 5406 } 5407 } 5408 unittest 5409 { 5410 byte32 R = cast(byte32) _mm256_subs_epi8(_mm256_setr_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0, 15, 14, 13, 12, 11, 126, 9, 8, 7, 6, 5, -127, 3, 2, 1, 0), 5411 _mm256_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, -10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 5412 static immutable byte[32] correct = [ 0, 0, 0, 0, 0, 117, 0, 0, 0, 0, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, 0, 0, 0, 0, 0, -128, 0, 0, 0, 0]; 5413 assert(R.array == correct); 5414 } 5415 5416 /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a` 5417 /// using saturation. 5418 __m256i _mm256_subs_epu16 (__m256i a, __m256i b) pure @trusted 5419 { 5420 // PERF DMD 5421 static if (GDC_with_AVX2) 5422 { 5423 return cast(__m256i) __builtin_ia32_psubusw256(cast(short16)a, cast(short16)b); 5424 } 5425 else static if(LDC_with_saturated_intrinsics) 5426 { 5427 return cast(__m256i) inteli_llvm_subus!short16(cast(short16)a, cast(short16)b); 5428 } 5429 else 5430 { 5431 short16 r; 5432 short16 sa = cast(short16)a; 5433 short16 sb = cast(short16)b; 5434 foreach(i; 0..16) 5435 r.ptr[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i])); 5436 return cast(__m256i)r; 5437 } 5438 } 5439 unittest 5440 { 5441 short16 R = cast(short16) _mm256_subs_epu16(_mm256_setr_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65534, 0), 5442 _mm256_setr_epi16(3, 4, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 20, cast(short)65535, 0)); 5443 static immutable short[16] correct = [0, 0, cast(short)65534, 0, 0, 0, cast(short)65534, 0, 0, 0, cast(short)65534, 0, 0, 0, 0, 0]; 5444 assert(R.array == correct); 5445 } 5446 5447 /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a` using 5448 /// saturation. 5449 __m256i _mm256_subs_epu8 (__m256i a, __m256i b) pure @trusted 5450 { 5451 // PERF DMD 5452 // PERF GDC without AVX2 5453 static if (GDC_with_AVX2) 5454 { 5455 return cast(__m256i) __builtin_ia32_psubusb256(cast(ubyte32)a, cast(ubyte32)b); 5456 } 5457 else static if(LDC_with_saturated_intrinsics) 5458 { 5459 return cast(__m256i) inteli_llvm_subus!byte32(cast(byte32)a, cast(byte32)b); 5460 } 5461 else 5462 { 5463 byte32 r; 5464 byte32 sa = cast(byte32)a; 5465 byte32 sb = cast(byte32)b; 5466 foreach(i; 0..32) 5467 r.ptr[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i])); 5468 return cast(__m256i)r; 5469 } 5470 } 5471 unittest 5472 { 5473 __m256i A = _mm256_setr_epi8(0, 0, 5, 4, 5, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, cast(byte)136, 0, 0, 0, 0, 0, 0); 5474 __m256i B = _mm256_setr_epi8(0, 0, 4, 5, 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)137, 0, 0, 0, 40, 0, 0, 0, 0, 0, 0); 5475 byte32 R = cast(byte32) _mm256_subs_epu8(A, B); 5476 static immutable byte[32] correct = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)254, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)0, 0, 0, 0, cast(byte) 96, 0, 0, 0, 0, 0, 0]; 5477 assert(R.array == correct); 5478 } 5479 5480 /// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in `a` and `b`. 5481 __m256i _mm256_unpackhi_epi16 (__m256i a, __m256i b) pure @safe 5482 { 5483 static if (GDC_with_AVX2) 5484 { 5485 return cast(long4) __builtin_ia32_punpckhwd256(cast(short16)a, cast(short16)b); 5486 } 5487 else static if (LDC_with_optimizations) 5488 { 5489 enum ir = `%r = shufflevector <16 x i16> %0, <16 x i16> %1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12,i32 28, i32 13,i32 29, i32 14,i32 30, i32 15,i32 31> 5490 ret <16 x i16> %r`; 5491 return cast(__m256i)LDCInlineIR!(ir, short16, short16, short16)(cast(short16)a, cast(short16)b); 5492 } 5493 else 5494 { 5495 // Better for arm64, GDC without AVX2 5496 __m128i a_lo = _mm256_extractf128_si256!0(a); 5497 __m128i a_hi = _mm256_extractf128_si256!1(a); 5498 __m128i b_lo = _mm256_extractf128_si256!0(b); 5499 __m128i b_hi = _mm256_extractf128_si256!1(b); 5500 __m128i r_lo = _mm_unpackhi_epi16(a_lo, b_lo); 5501 __m128i r_hi = _mm_unpackhi_epi16(a_hi, b_hi); 5502 return _mm256_set_m128i(r_hi, r_lo); 5503 } 5504 } 5505 unittest 5506 { 5507 __m256i A = _mm256_setr_epi16( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 5508 __m256i B = _mm256_setr_epi16(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 5509 short16 C = cast(short16) _mm256_unpackhi_epi16(A, B); 5510 short[16] correct = [4, 20, 5, 21, 6, 22, 7, 23, 5511 12, 28, 13, 29, 14, 30, 15, 31]; 5512 assert(C.array == correct); 5513 } 5514 5515 /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in `a` and `b`. 5516 __m256i _mm256_unpackhi_epi32 (__m256i a, __m256i b) pure @trusted 5517 { 5518 static if (GDC_with_AVX2) 5519 enum bool split = false; 5520 else version(GNU) 5521 enum bool split = true; 5522 else 5523 enum bool split = false; 5524 5525 static if (GDC_with_AVX2) 5526 { 5527 return cast(long4) __builtin_ia32_punpckhdq256(cast(int8)a, cast(int8)b); 5528 } 5529 else static if (LDC_with_optimizations) 5530 { 5531 // LDC AVX2: Suprisingly, this start using vunpckhps in LDC 1.31 -O2 5532 enum ir = `%r = shufflevector <8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> 5533 ret <8 x i32> %r`; 5534 return cast(__m256i)LDCInlineIR!(ir, int8, int8, int8)(cast(int8)a, cast(int8)b); 5535 } 5536 else static if (split) 5537 { 5538 __m128i a_lo = _mm256_extractf128_si256!0(a); 5539 __m128i a_hi = _mm256_extractf128_si256!1(a); 5540 __m128i b_lo = _mm256_extractf128_si256!0(b); 5541 __m128i b_hi = _mm256_extractf128_si256!1(b); 5542 __m128i r_lo = _mm_unpackhi_epi32(a_lo, b_lo); 5543 __m128i r_hi = _mm_unpackhi_epi32(a_hi, b_hi); 5544 return _mm256_set_m128i(r_hi, r_lo); 5545 } 5546 else 5547 { 5548 int8 R; 5549 int8 ai = cast(int8)a; 5550 int8 bi = cast(int8)b; 5551 R.ptr[0] = ai.array[2]; 5552 R.ptr[1] = bi.array[2]; 5553 R.ptr[2] = ai.array[3]; 5554 R.ptr[3] = bi.array[3]; 5555 R.ptr[4] = ai.array[6]; 5556 R.ptr[5] = bi.array[6]; 5557 R.ptr[6] = ai.array[7]; 5558 R.ptr[7] = bi.array[7]; 5559 return cast(__m256i) R; 5560 } 5561 } 5562 unittest 5563 { 5564 __m256i A = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); 5565 __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15); 5566 int8 C = cast(int8) _mm256_unpackhi_epi32(A, B); 5567 int[8] correct = [2, 10, 3, 11, 6, 14, 7, 15]; 5568 assert(C.array == correct); 5569 } 5570 5571 /// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in `a` and `b`, 5572 __m256i _mm256_unpackhi_epi8 (__m256i a, __m256i b) pure @trusted 5573 { 5574 static if (GDC_with_AVX2) 5575 { 5576 return cast(__m256i) __builtin_ia32_punpckhbw256(cast(ubyte32)a, cast(ubyte32)b); 5577 } 5578 else static if (LDC_with_optimizations) 5579 { 5580 enum ir = `%r = shufflevector <32 x i8> %0, <32 x i8> %1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> 5581 ret <32 x i8> %r`; 5582 return cast(__m256i)LDCInlineIR!(ir, byte32, byte32, byte32)(cast(byte32)a, cast(byte32)b); 5583 } 5584 else 5585 { 5586 // Splitting always beneficial 5587 __m128i a_lo = _mm256_extractf128_si256!0(a); 5588 __m128i a_hi = _mm256_extractf128_si256!1(a); 5589 __m128i b_lo = _mm256_extractf128_si256!0(b); 5590 __m128i b_hi = _mm256_extractf128_si256!1(b); 5591 __m128i r_lo = _mm_unpackhi_epi8(a_lo, b_lo); 5592 __m128i r_hi = _mm_unpackhi_epi8(a_hi, b_hi); 5593 return _mm256_set_m128i(r_hi, r_lo); 5594 } 5595 } 5596 unittest 5597 { 5598 __m256i A = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5599 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 5600 __m256i B = _mm256_setr_epi8( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 5601 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); 5602 byte32 C = cast(byte32) _mm256_unpackhi_epi8(A, B); 5603 byte[32] correct = [ 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 5604 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 ]; 5605 assert(C.array == correct); 5606 } 5607 5608 /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in `a` and `b`. 5609 __m256i _mm256_unpackhi_epi64 (__m256i a, __m256i b) pure @trusted 5610 { 5611 version(GNU) 5612 enum split = true; // Benefits GDC in non-AVX2 5613 else 5614 enum split = false; 5615 5616 static if (GDC_with_AVX2) 5617 { 5618 return __builtin_ia32_punpckhqdq256(a, b); 5619 } 5620 else static if (LDC_with_optimizations) 5621 { 5622 enum ir = `%r = shufflevector <4 x i64> %0, <4 x i64> %1, <4 x i32> <i32 1, i32 5, i32 3, i32 7> 5623 ret <4 x i64> %r`; 5624 return cast(__m256i)LDCInlineIR!(ir, long4, long4, long4)(a, b); 5625 } 5626 else static if (split) 5627 { 5628 __m128i a_lo = _mm256_extractf128_si256!0(a); 5629 __m128i a_hi = _mm256_extractf128_si256!1(a); 5630 __m128i b_lo = _mm256_extractf128_si256!0(b); 5631 __m128i b_hi = _mm256_extractf128_si256!1(b); 5632 __m128i r_lo = _mm_unpackhi_epi64(a_lo, b_lo); 5633 __m128i r_hi = _mm_unpackhi_epi64(a_hi, b_hi); 5634 return _mm256_set_m128i(r_hi, r_lo); 5635 } 5636 else 5637 { 5638 long4 R; 5639 R.ptr[0] = a.array[1]; 5640 R.ptr[1] = b.array[1]; 5641 R.ptr[2] = a.array[3]; 5642 R.ptr[3] = b.array[3]; 5643 return R; 5644 } 5645 } 5646 unittest 5647 { 5648 __m256i A = _mm256_setr_epi64(0x22222222_22222222, 0x33333333_33333333, 2, 3); 5649 __m256i B = _mm256_setr_epi64(0x44444444_44444444, 0x55555555_55555555, 4, 5); 5650 long4 C = _mm256_unpackhi_epi64(A, B); 5651 long[4] correct = [0x33333333_33333333, 0x55555555_55555555, 3, 5]; 5652 assert(C.array == correct); 5653 } 5654 5655 /// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in `a` and `b`. 5656 __m256i _mm256_unpacklo_epi16 (__m256i a, __m256i b) pure @safe 5657 { 5658 static if (GDC_with_AVX2) 5659 { 5660 return cast(__m256i) __builtin_ia32_punpcklwd256(cast(short16)a, cast(short16)b); 5661 } 5662 else static if (LDC_with_optimizations) 5663 { 5664 enum ir = `%r = shufflevector <16 x i16> %0, <16 x i16> %1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27> 5665 ret <16 x i16> %r`; 5666 return cast(__m256i)LDCInlineIR!(ir, short16, short16, short16)(cast(short16)a, cast(short16)b); 5667 } 5668 else 5669 { 5670 __m128i a_lo = _mm256_extractf128_si256!0(a); 5671 __m128i a_hi = _mm256_extractf128_si256!1(a); 5672 __m128i b_lo = _mm256_extractf128_si256!0(b); 5673 __m128i b_hi = _mm256_extractf128_si256!1(b); 5674 __m128i r_lo = _mm_unpacklo_epi16(a_lo, b_lo); 5675 __m128i r_hi = _mm_unpacklo_epi16(a_hi, b_hi); 5676 return _mm256_set_m128i(r_hi, r_lo); 5677 } 5678 } 5679 unittest 5680 { 5681 __m256i A = _mm256_setr_epi16( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 5682 __m256i B = _mm256_setr_epi16(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 5683 short16 C = cast(short16) _mm256_unpacklo_epi16(A, B); 5684 short[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 5685 8, 24, 9, 25, 10, 26, 11, 27]; 5686 assert(C.array == correct); 5687 } 5688 5689 /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in `a` and `b`. 5690 __m256i _mm256_unpacklo_epi32 (__m256i a, __m256i b) pure @trusted 5691 { 5692 static if (GDC_with_AVX2) 5693 enum bool split = false; 5694 else version(GNU) 5695 enum bool split = true; 5696 else 5697 enum bool split = false; 5698 5699 static if (GDC_with_AVX2) 5700 { 5701 return cast(long4) __builtin_ia32_punpckldq256(cast(int8)a, cast(int8)b); 5702 } 5703 else static if (LDC_with_optimizations) 5704 { 5705 // LDC AVX2: Suprisingly, this start using vunpcklps in LDC 1.31 -O1 5706 enum ir = `%r = shufflevector <8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> 5707 ret <8 x i32> %r`; 5708 return cast(__m256i)LDCInlineIR!(ir, int8, int8, int8)(cast(int8)a, cast(int8)b); 5709 } 5710 else static if (split) 5711 { 5712 __m128i a_lo = _mm256_extractf128_si256!0(a); 5713 __m128i a_hi = _mm256_extractf128_si256!1(a); 5714 __m128i b_lo = _mm256_extractf128_si256!0(b); 5715 __m128i b_hi = _mm256_extractf128_si256!1(b); 5716 __m128i r_lo = _mm_unpacklo_epi32(a_lo, b_lo); 5717 __m128i r_hi = _mm_unpacklo_epi32(a_hi, b_hi); 5718 return _mm256_set_m128i(r_hi, r_lo); 5719 } 5720 else 5721 { 5722 int8 R; 5723 int8 ai = cast(int8)a; 5724 int8 bi = cast(int8)b; 5725 R.ptr[0] = ai.array[0]; 5726 R.ptr[1] = bi.array[0]; 5727 R.ptr[2] = ai.array[1]; 5728 R.ptr[3] = bi.array[1]; 5729 R.ptr[4] = ai.array[4]; 5730 R.ptr[5] = bi.array[4]; 5731 R.ptr[6] = ai.array[5]; 5732 R.ptr[7] = bi.array[5]; 5733 return cast(__m256i) R; 5734 } 5735 } 5736 unittest 5737 { 5738 __m256i A = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); 5739 __m256i B = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15); 5740 int8 C = cast(int8) _mm256_unpacklo_epi32(A, B); 5741 int[8] correct = [0, 8, 1, 9, 4, 12, 5, 13]; 5742 assert(C.array == correct); 5743 } 5744 5745 /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in `a` and `b`. 5746 __m256i _mm256_unpacklo_epi64 (__m256i a, __m256i b) pure @trusted 5747 { 5748 version(GNU) 5749 enum split = true; // Benefits GDC in non-AVX2 5750 else 5751 enum split = false; 5752 5753 static if (GDC_with_AVX2) 5754 { 5755 return __builtin_ia32_punpcklqdq256(a, b); 5756 } 5757 else static if (LDC_with_optimizations) 5758 { 5759 enum ir = `%r = shufflevector <4 x i64> %0, <4 x i64> %1, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 5760 ret <4 x i64> %r`; 5761 return cast(__m256i)LDCInlineIR!(ir, long4, long4, long4)(a, b); 5762 } 5763 else static if (split) 5764 { 5765 __m128i a_lo = _mm256_extractf128_si256!0(a); 5766 __m128i a_hi = _mm256_extractf128_si256!1(a); 5767 __m128i b_lo = _mm256_extractf128_si256!0(b); 5768 __m128i b_hi = _mm256_extractf128_si256!1(b); 5769 __m128i r_lo = _mm_unpacklo_epi64(a_lo, b_lo); 5770 __m128i r_hi = _mm_unpacklo_epi64(a_hi, b_hi); 5771 return _mm256_set_m128i(r_hi, r_lo); 5772 } 5773 else 5774 { 5775 long4 R; 5776 R.ptr[0] = a.array[0]; 5777 R.ptr[1] = b.array[0]; 5778 R.ptr[2] = a.array[2]; 5779 R.ptr[3] = b.array[2]; 5780 return R; 5781 } 5782 } 5783 unittest 5784 { 5785 __m256i A = _mm256_setr_epi64(0x22222222_22222222, 0x33333333_33333333, 2, 3); 5786 __m256i B = _mm256_setr_epi64(0x44444444_44444444, 0x55555555_55555555, 4, 5); 5787 long4 C = _mm256_unpacklo_epi64(A, B); 5788 long[4] correct = [0x22222222_22222222, 0x44444444_44444444, 2, 4]; 5789 assert(C.array == correct); 5790 } 5791 5792 /// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in `a` and `b`. 5793 __m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b) pure @trusted 5794 { 5795 static if (GDC_with_AVX2) 5796 { 5797 return cast(__m256i) __builtin_ia32_punpcklbw256(cast(ubyte32)a, cast(ubyte32)b); 5798 } 5799 else static if (LDC_with_optimizations) 5800 { 5801 enum ir = `%r = shufflevector <32 x i8> %0, <32 x i8> %1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> 5802 ret <32 x i8> %r`; 5803 return cast(__m256i)LDCInlineIR!(ir, byte32, byte32, byte32)(cast(byte32)a, cast(byte32)b); 5804 } 5805 else 5806 { 5807 // Splitting always beneficial 5808 __m128i a_lo = _mm256_extractf128_si256!0(a); 5809 __m128i a_hi = _mm256_extractf128_si256!1(a); 5810 __m128i b_lo = _mm256_extractf128_si256!0(b); 5811 __m128i b_hi = _mm256_extractf128_si256!1(b); 5812 __m128i r_lo = _mm_unpacklo_epi8(a_lo, b_lo); 5813 __m128i r_hi = _mm_unpacklo_epi8(a_hi, b_hi); 5814 return _mm256_set_m128i(r_hi, r_lo); 5815 } 5816 } 5817 unittest 5818 { 5819 __m256i A = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5820 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 5821 __m256i B = _mm256_setr_epi8( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 5822 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); 5823 byte32 C = cast(byte32) _mm256_unpacklo_epi8(A, B); 5824 byte[32] correct = [ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 5825 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 ]; 5826 assert(C.array == correct); 5827 } 5828 5829 /// Compute the bitwise XOR of 256 bits (representing integer data) in `a` and `b`. 5830 __m256i _mm256_xor_si256 (__m256i a, __m256i b) pure @safe 5831 { 5832 return a ^ b; 5833 } 5834 unittest 5835 { 5836 __m256i A = _mm256_setr_epi64(975394, 619809709, -1, 54); 5837 __m256i B = _mm256_setr_epi64(-920275025, -6, 85873, 96644); 5838 long4 R = cast(long4) _mm256_xor_si256(A, B); 5839 long[4] correct = [975394 ^ (-920275025L), 619809709L ^ -6, (-1) ^ 85873, 54 ^ 96644]; 5840 assert(R.array == correct); 5841 } 5842 5843 private bool isValidSIBScale(const int scale) 5844 { 5845 // Encoded using two SIB bits in the x86 instruction 5846 return scale == 1 || scale == 2 || scale == 4 || scale == 8; 5847 }