/**
* AVX2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX2
*
* Copyright: Guillaume Piolat 2022.
*            Johan Engelen 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.avx2intrin;

// AVX2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX2
// Note: this header will work whether you have AVX2 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx2"] or equivalent to actively
// generate AVX2 instructions.
// With GDC, use "dflags-gdc": ["-mavx2"] or equivalent to actively
// generate AVX2 instructions.
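//
// For example, a minimal dub.json sketch (the package name and the version
// constraint below are placeholders, not from this project):
//
//     {
//         "name": "myapp",
//         "dependencies": { "intel-intrinsics": "~>1.0" },
//         "dflags-ldc": ["-mattr=+avx2"],
//         "dflags-gdc": ["-mavx2"]
//     }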

public import inteli.types;
import inteli.internals;

// Pull in all previous instruction set intrinsics.
public import inteli.avxintrin;

nothrow @nogc:

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m256i _mm256_abs_epi16 (__m256i a) @trusted
{
    // PERF DMD
    version(LDC)
        enum split = true; // always beneficial in LDC neon, ssse3, or even sse2
    else
        enum split = GDC_with_SSSE3;

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pabsw256(cast(short16)a);
    }
    else static if (__VERSION__ >= 2097 && LDC_with_AVX2)
    {
        // Before LDC 1.27, the llvm.abs LLVM intrinsic didn't exist, hence
        // there was no good way to do abs(256-bit).
        return cast(__m256i) inteli_llvm_abs!short16(cast(short16)a, false);
    }
    else static if (split)
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_abs_epi16(a_lo);
        __m128i r_hi = _mm_abs_epi16(a_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {
        short16 sa = cast(short16)a;
        for (int i = 0; i < 16; ++i)
        {
            short s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s));
        }
        return cast(__m256i)sa;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000,
                                  1, -1, -32768, 32767, 12, -13, 1000, -1040);
    short16 B = cast(short16) _mm256_abs_epi16(A);
    short[16] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000,
                         1, 1, -32768, 32767, 12, 13, 1000, 1040];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m256i _mm256_abs_epi32 (__m256i a) @trusted
{
    // PERF DMD
    version(LDC)
        enum split = true; // always beneficial in LDC neon, ssse3, or even sse2
    else
        enum split = false; // GDC manages to split and use pabsd in SSSE3 without guidance

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pabsd256(cast(int8)a);
    }
    else static if (__VERSION__ >= 2097 && LDC_with_AVX2)
    {
        // Before LDC 1.27, the llvm.abs LLVM intrinsic didn't exist, hence
        // there was no good way to do abs(256-bit).
        return cast(__m256i) inteli_llvm_abs!int8(cast(int8)a, false);
    }
    else static if (split)
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_abs_epi32(a_lo);
        __m128i r_hi = _mm_abs_epi32(a_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {
        int8 sa = cast(int8)a;
        for (int i = 0; i < 8; ++i)
        {
            int s = sa.array[i];
            sa.ptr[i] = (s >= 0 ? s : -s);
        }
        return cast(__m256i)sa;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647, -1, 0, -2_147_483_648, -2_147_483_646);
    int8 B = cast(int8) _mm256_abs_epi32(A);
    int[8] correct = [0, 1, -2_147_483_648, 2_147_483_647, 1, 0, -2_147_483_648, 2_147_483_646];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
__m256i _mm256_abs_epi8 (__m256i a) @trusted
{
    // PERF DMD
    // PERF GDC from SSSE3 to AVX doesn't use pabsb, and splitting is catastrophic because of _mm_min_epu8
    version(LDC)
        enum split = true; // always beneficial in LDC neon, ssse3, sse2
    else
        enum split = false;

    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pabsb256(cast(ubyte32)a);
    }
    else static if (__VERSION__ >= 2097 && LDC_with_AVX2)
    {
        // Before LDC 1.27, the llvm.abs LLVM intrinsic didn't exist, hence
        // there was no good way to do abs(256-bit).
        return cast(__m256i) inteli_llvm_abs!byte32(cast(byte32)a, false);
    }
    else static if (split)
    {
        __m128i a_lo = _mm256_extractf128_si256!0(a);
        __m128i a_hi = _mm256_extractf128_si256!1(a);
        __m128i r_lo = _mm_abs_epi8(a_lo);
        __m128i r_hi = _mm_abs_epi8(a_hi);
        return _mm256_set_m128i(r_hi, r_lo);
    }
    else
    {
        // Basically this loop is poison for the LDC optimizer.
        byte32 sa = cast(byte32)a;
        for (int i = 0; i < 32; ++i)
        {
            byte s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : cast(byte)(-cast(int)(s));
        }
        return cast(__m256i)sa;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                 0, -1, -128, -126, 127, -6, -5, -4, -3, -2, 0, 1, 2, 3, 4, 5);
    byte32 B = cast(byte32) _mm256_abs_epi8(A);
    byte[32] correct = [0, 1, -128, 127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                        0, 1, -128, 126, 127, 6, 5, 4, 3, 2, 0, 1, 2, 3, 4, 5];
    assert(B.array == correct);
}

/// Add packed 16-bit integers in `a` and `b`.
__m256i _mm256_add_epi16 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return cast(__m256i)(cast(short16)a + cast(short16)b);
}
unittest
{
    __m256i A = _mm256_setr_epi16( -7, -1, 0, 9, -100, 100, 234, 432, -32768, 32767, 0, -1, -20000, 0, 6, -2);
    short16 R = cast(short16) _mm256_add_epi16(A, A);
    short[16] correct = [ -14, -2, 0, 18, -200, 200, 468, 864, 0, -2, 0, -2, 25536, 0, 12, -4 ];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m256i _mm256_add_epi32 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return cast(__m256i)(cast(int8)a + cast(int8)b);
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432);
    int8 R = cast(int8) _mm256_add_epi32(A, A);
    int[8] correct = [ -14, -2, 0, 18, -200, 200, 468, 864 ];
    assert(R.array == correct);
}

/// Add packed 64-bit integers in `a` and `b`.
__m256i _mm256_add_epi64 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return a + b;
}
unittest
{
    __m256i A = _mm256_setr_epi64(-1, 0x8000_0000_0000_0000, 42, -12);
    long4 R = cast(long4) _mm256_add_epi64(A, A);
    long[4] correct = [ -2, 0, 84, -24 ];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m256i _mm256_add_epi8 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return cast(__m256i)(cast(byte32)a + cast(byte32)b);
}
unittest
{
    __m256i A = _mm256_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78,
                                 4, 9, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -2, 0, 10, 78);
    byte32 R = cast(byte32) _mm256_add_epi8(A, A);
    byte[32] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100,
                        8, 18, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -4, 0, 20, -100];
    assert(R.array == correct);
}

/// Add packed 16-bit signed integers in `a` and `b` using signed saturation.
__m256i _mm256_adds_epi16 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_paddsw256(cast(short16)a, cast(short16)b);
    }
    else version(LDC)
    {
        return cast(__m256i) inteli_llvm_adds!short16(cast(short16)a, cast(short16)b);
    }
    else
    {
        short16 r;
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        foreach(i; 0..16)
            r.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return cast(__m256i)r;
    }
}
unittest
{
    short16 res = cast(short16) _mm256_adds_epi16(_mm256_setr_epi16(7, 6, 5, -32768, 3, 3, 32767, 0, 7, 6, 5, -32768, 3, 3, 32767, 0),
                                                  _mm256_setr_epi16(7, 6, 5, -30000, 3, 1, 1, -10, 7, 6, 5, -30000, 3, 1, 1, -10));
    static immutable short[16] correctResult = [14, 12, 10, -32768, 6, 4, 32767, -10, 14, 12, 10, -32768, 6, 4, 32767, -10];
    assert(res.array == correctResult);
}

/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m256i _mm256_adds_epi8 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_paddsb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else version(LDC)
    {
        return cast(__m256i) inteli_llvm_adds!byte32(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        byte32 r;
        byte32 sa = cast(byte32)a;
        byte32 sb = cast(byte32)b;
        foreach(i; 0..32)
            r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        return cast(__m256i)r;
    }
}
unittest
{
    byte32 res = cast(byte32) _mm256_adds_epi8(_mm256_setr_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0, 15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0),
                                               _mm256_setr_epi8(15, 14, 13, 12, 11,  10, 9, 8, 7, 6, 5,   -4, 3, 2, 1, 0, 15, 14, 13, 12, 11,  10, 9, 8, 7, 6, 5,   -4, 3, 2, 1, 0));
    static immutable byte[32] correctResult = [30, 28, 26, 24, 22, 127, 18, 16, 14, 12, 10, -128, 6, 4, 2, 0,
                                               30, 28, 26, 24, 22, 127, 18, 16, 14, 12, 10, -128, 6, 4, 2, 0];
    assert(res.array == correctResult);
}

/// Add packed 16-bit unsigned integers in `a` and `b` using unsigned saturation.
__m256i _mm256_adds_epu16 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_paddusw256(cast(short16)a, cast(short16)b);
    }
    else version(LDC)
    {
        return cast(__m256i) inteli_llvm_addus!short16(cast(short16)a, cast(short16)b);
    }
    else
    {
        short16 r;
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        foreach(i; 0..16)
            r.ptr[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return cast(__m256i)r;
    }
}
unittest
{
    short16 res = cast(short16) _mm256_adds_epu16(_mm256_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                                  _mm256_set_epi16(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[16] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
__m256i _mm256_adds_epu8 (__m256i a, __m256i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_paddusb256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else version(LDC)
    {
        return cast(__m256i) inteli_llvm_addus!byte32(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        byte32 r;
        byte32 sa = cast(byte32)a;
        byte32 sb = cast(byte32)b;
        foreach(i; 0..32)
            r.ptr[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, cast(byte)136, 0, 0, 0, 0, 0, 0);
    __m256i B = _mm256_setr_epi8(0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)136, 0, 0, 0, 40, 0, 0, 0, 0, 0, 0);
    byte32 R = cast(byte32) _mm256_adds_epu8(A, B);
    static immutable byte[32] correct = [0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, 0, 0, 0, 0, 0, cast(byte)255, 0, 0, 0, cast(byte)176, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

// TODO __m256i _mm256_alignr_epi8 (__m256i a, __m256i b, const int imm8) pure @safe

/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_and_si256 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return a & b;
}
unittest
{
    __m256i A = _mm256_set1_epi32(7);
    __m256i B = _mm256_set1_epi32(14);
    int8 R = cast(int8) _mm256_and_si256(A, B);
    int[8] correct = [6, 6, 6, 6, 6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of 256 bits (representing integer data) in `a` and then AND with `b`.
__m256i _mm256_andnot_si256 (__m256i a, __m256i b) pure @safe
{
    pragma(inline, true);
    return (~a) & b;
}
unittest
{
    __m256i A = _mm256_setr_epi32(7, -2, 9, 54654, 7, -2, 9, 54654);
    __m256i B = _mm256_setr_epi32(14, 78, 111, -256, 14, 78, 111, -256);
    int8 R = cast(int8) _mm256_andnot_si256(A, B);
    int[8] correct = [8, 0, 102, -54784, 8, 0, 102, -54784];
    assert(R.array == correct);
}


// TODO __m256i _mm256_avg_epu16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_avg_epu8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_blend_epi16 (__m256i a, __m256i b, const int imm8) pure @safe
// TODO __m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8) pure @safe
// TODO __m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8) pure @safe
// TODO __m256i _mm256_blendv_epi8 (__m256i a, __m256i b, __m256i mask) pure @safe
// TODO __m128i _mm_broadcastb_epi8 (__m128i a) pure @safe
// TODO __m256i _mm256_broadcastb_epi8 (__m128i a) pure @safe
// TODO __m128i _mm_broadcastd_epi32 (__m128i a) pure @safe
// TODO __m256i _mm256_broadcastd_epi32 (__m128i a) pure @safe
// TODO __m128i _mm_broadcastq_epi64 (__m128i a) pure @safe
// TODO __m256i _mm256_broadcastq_epi64 (__m128i a) pure @safe
// TODO __m128d _mm_broadcastsd_pd (__m128d a) pure @safe
// TODO __m256d _mm256_broadcastsd_pd (__m128d a) pure @safe
// TODO __m256i _mm_broadcastsi128_si256 (__m128i a) pure @safe
// TODO __m256i _mm256_broadcastsi128_si256 (__m128i a) pure @safe
// TODO __m128 _mm_broadcastss_ps (__m128 a) pure @safe
// TODO __m256 _mm256_broadcastss_ps (__m128 a) pure @safe
// TODO __m128i _mm_broadcastw_epi16 (__m128i a) pure @safe
// TODO __m256i _mm256_broadcastw_epi16 (__m128i a) pure @safe
// TODO __m256i _mm256_bslli_epi128 (__m256i a, const int imm8) pure @safe
// TODO __m256i _mm256_bsrli_epi128 (__m256i a, const int imm8) pure @safe
// TODO __m256i _mm256_cmpeq_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cmpeq_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cmpeq_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cmpgt_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cmpgt_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cmpgt_epi64 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cmpgt_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_cvtepi16_epi32 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepi16_epi64 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepi32_epi64 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepi8_epi16 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepi8_epi32 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepi8_epi64 (__m128i a) pure @safe

/// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
// TODO verify
__m256i _mm256_cvtepu16_epi32(__m128i a) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmovzxwd256(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int8 r; // PERF =void;
        // Explicit cast to unsigned to get *zero* extension (instead of sign extension).
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        r.ptr[4] = cast(ushort)sa.array[4];
        r.ptr[5] = cast(ushort)sa.array[5];
        r.ptr[6] = cast(ushort)sa.array[6];
        r.ptr[7] = cast(ushort)sa.array[7];
        return cast(__m256i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, -1, 0, -32768, 32767);
    int8 C = cast(int8) _mm256_cvtepu16_epi32(A);
    int[8] correct = [65535, 0, 32768, 32767, 65535, 0, 32768, 32767];
    assert(C.array == correct);
}

// TODO __m256i _mm256_cvtepu16_epi64 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepu32_epi64 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepu8_epi16 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepu8_epi32 (__m128i a) pure @safe
// TODO __m256i _mm256_cvtepu8_epi64 (__m128i a) pure @safe
// TODO int _mm256_extract_epi16 (__m256i a, const int index) pure @safe
// TODO int _mm256_extract_epi8 (__m256i a, const int index) pure @safe

/// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`.
// TODO verify
__m128i _mm256_extracti128_si256(int imm8)(__m256i a) pure @trusted
    if ( (imm8 == 0) || (imm8 == 1) )
{
    pragma(inline, true);

    static if (GDC_with_AVX2)
    {
        return cast(__m128i) __builtin_ia32_extract128i256(a, imm8);
    }
    else version (LDC)
    {
        enum str = (imm8 == 1) ? "<i32 2, i32 3>" : "<i32 0, i32 1>";
        enum ir = "%r = shufflevector <4 x i64> %0, <4 x i64> undef, <2 x i32>" ~ str ~ "\n" ~
                  "ret <2 x i64> %r";
        return cast(__m128i) LDCInlineIR!(ir, ulong2, ulong4)(cast(ulong4)a);
    }
    else
    {
        long4 al = cast(long4) a;
        long2 ret;
        ret.ptr[0] = (imm8==1) ? al.array[2] : al.array[0];
        ret.ptr[1] = (imm8==1) ? al.array[3] : al.array[1];
        return cast(__m128i) ret;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32( -7, -1, 0, 9, -100, 100, 234, 432 );
    int[4] correct0 = [ -7, -1, 0, 9 ];
    int[4] correct1 = [ -100, 100, 234, 432 ];
    __m128i R0 = _mm256_extracti128_si256!(0)(A);
    __m128i R1 = _mm256_extracti128_si256!(1)(A);
    assert(R0.array == correct0);
    assert(R1.array == correct1);
}

// TODO __m256i _mm256_hadd_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_hadd_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_hadds_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_hsub_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_hsub_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_hsubs_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m128i _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128i _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe
// TODO __m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale) pure @safe
// TODO __m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale) pure @safe
// TODO __m128i _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128i _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe
// TODO __m256i _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m256i _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale) pure @safe
// TODO __m128d _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128d _mm_mask_i32gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) pure @safe
// TODO __m256d _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m256d _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale) pure @safe
// TODO __m128 _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128 _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) pure @safe
// TODO __m256 _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale) pure @safe
// TODO __m256 _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale) pure @safe
// TODO __m128i _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128i _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe
// TODO __m128i _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale) pure @safe
// TODO __m128i _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale) pure @safe
// TODO __m128i _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128i _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe
// TODO __m256i _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale) pure @safe
// TODO __m256i _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale) pure @safe
// TODO __m128d _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128d _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) pure @safe
// TODO __m256d _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale) pure @safe
// TODO __m256d _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale) pure @safe
// TODO __m128 _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale) pure @safe
// TODO __m128 _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) pure @safe
// TODO __m128 _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale) pure @safe
// TODO __m128 _mm256_mask_i64gather_ps (__m128 src, float const* base_addr, __m256i vindex, __m128 mask, const int scale) pure @safe
// TODO __m256i _mm256_inserti128_si256 (__m256i a, __m128i b, const int imm8) pure @safe

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
/// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
/// and pack the results in destination.
// TODO verify
__m256i _mm256_madd_epi16 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pmaddwd256(cast(short16)a, cast(short16)b);
    }
    else
    {
        short16 sa = cast(short16)a;
        short16 sb = cast(short16)b;
        int8 r; // PERF =void;
        foreach(i; 0..8)
        {
            r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return cast(__m256i) r;
    }
}
unittest
{
    short16 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short16 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767, 0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int8 R = cast(int8) _mm256_madd_epi16(cast(__m256i)A, cast(__m256i)B);
    int[8] correct = [1, 13, -2147483648, 2*32767*32767, 1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}

// TODO __m256i _mm256_maddubs_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m128i _mm_maskload_epi32 (int const* mem_addr, __m128i mask) pure @safe
// TODO __m256i _mm256_maskload_epi32 (int const* mem_addr, __m256i mask) pure @safe
// TODO __m128i _mm_maskload_epi64 (__int64 const* mem_addr, __m128i mask) pure @safe
// TODO __m256i _mm256_maskload_epi64 (__int64 const* mem_addr, __m256i mask) pure @safe
// TODO __m256i _mm256_max_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_max_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_max_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_max_epu16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_max_epu32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_max_epu8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_min_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_min_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_min_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_min_epu16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_min_epu32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_min_epu8 (__m256i a, __m256i b) pure @safe
// TODO int _mm256_movemask_epi8 (__m256i a) pure @safe
// TODO __m256i _mm256_mpsadbw_epu8 (__m256i a, __m256i b, const int imm8) pure @safe
// TODO __m256i _mm256_mul_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_mul_epu32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_mulhi_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_mulhi_epu16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_mulhrs_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_mullo_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_mullo_epi32 (__m256i a, __m256i b) pure @safe

/// Compute the bitwise OR of 256 bits (representing integer data) in `a` and `b`.
__m256i _mm256_or_si256 (__m256i a, __m256i b) pure @safe
{
    return a | b;
}
unittest
{
    long A = 0x55555555_55555555;
    long B = 0xAAAAAAAA_AAAAAAAA;
    __m256i vA = _mm256_set_epi64(A, B, A, B);
    __m256i vB = _mm256_set_epi64(B, A, 0, B);
    __m256i R = _mm256_or_si256(vA, vB);
    long[4] correct = [B, A, -1, -1];
    assert(R.array == correct);
}

// TODO __m256i _mm256_packs_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_packs_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_packus_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_packus_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8) pure @safe
// TODO __m256i _mm256_permute4x64_epi64 (__m256i a, const int imm8) pure @safe
// TODO __m256d _mm256_permute4x64_pd (__m256d a, const int imm8) pure @safe
// TODO __m256i _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx) pure @safe
// TODO __m256 _mm256_permutevar8x32_ps (__m256 a, __m256i idx) pure @safe

/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each
/// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the
/// low 16 bits of 64-bit elements in result.
// TODO verify
__m256i _mm256_sad_epu8 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psadbw256(cast(ubyte32)a, cast(ubyte32)b);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psadbw256(cast(byte32)a, cast(byte32)b);
    }
    else
    {
        // PERF: ARM64/32 is lacking
        byte32 ab = cast(byte32)a;
        byte32 bb = cast(byte32)b;
        ubyte[32] t;
        foreach(i; 0..32)
        {
            int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
            if (diff < 0) diff = -diff;
            t.ptr[i] = cast(ubyte)(diff);
        }
        int8 r = cast(int8) _mm256_setzero_si256();
        r.ptr[0] = t[0]  + t[1]  + t[2]  + t[3]  + t[4]  + t[5]  + t[6]  + t[7];
        r.ptr[2] = t[8]  + t[9]  + t[10] + t[11] + t[12] + t[13] + t[14] + t[15];
        r.ptr[4] = t[16] + t[17] + t[18] + t[19] + t[20] + t[21] + t[22] + t[23];
        r.ptr[6] = t[24] + t[25] + t[26] + t[27] + t[28] + t[29] + t[30] + t[31];
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54,
                                 3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m256i B = _mm256_set1_epi8(1);
    int8 R = cast(int8) _mm256_sad_epu8(A, B);
    int[8] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0,
                      2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}


// TODO __m256i _mm256_shuffle_epi32 (__m256i a, const int imm8) pure @safe
// TODO __m256i _mm256_shuffle_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_shufflehi_epi16 (__m256i a, const int imm8) pure @safe
// TODO __m256i _mm256_shufflelo_epi16 (__m256i a, const int imm8) pure @safe
// TODO __m256i _mm256_sign_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_sign_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_sign_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_sll_epi16 (__m256i a, __m128i count) pure @safe
// TODO __m256i _mm256_sll_epi32 (__m256i a, __m128i count) pure @safe
// TODO __m256i _mm256_sll_epi64 (__m256i a, __m128i count) pure @safe

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
// TODO verify
__m256i _mm256_slli_epi16(__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psllwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else
    {
        // PERF: ARM
        short16 sa = cast(short16)a;
        short16 r = cast(short16) _mm256_setzero_si256();
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m256i)r;
        foreach(i; 0..16)
            r.ptr[i] = cast(short)(sa.array[i] << count);
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
    short16 B = cast(short16)( _mm256_slli_epi16(A, 1) );
    short16 B2 = cast(short16)( _mm256_slli_epi16(A, 1 + 256) );
    short[16] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14, 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16)( _mm256_slli_epi16(A, 16) );
    short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
// TODO verify
__m256i _mm256_slli_epi32 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_pslldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsic guarantees that only imm8[0..7] is taken, whereas
        // in D shifting by as many or more bits than the width of the operand
        // is undefined behaviour, hence the explicit count check below.
        int8 a_int8 = cast(int8) a;
        int8 r = cast(int8) _mm256_setzero_si256();

        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            return cast(__m256i) r;

        foreach(i; 0..8)
            r.ptr[i] = cast(uint)(a_int8.array[i]) << count;
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
    int8 B = cast(int8) _mm256_slli_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_slli_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 4, 6, -8, 0, 4, 6, -8 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_slli_epi32(A, 0);
    int[8] expectedC = [ 0, 2, 3, -4, 0, 2, 3, -4 ];
    assert(C.array == expectedC);

    int8 D = cast(int8) _mm256_slli_epi32(A, 65);
    int[8] expectedD = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(D.array == expectedD);
}

// TODO __m256i _mm256_slli_epi64 (__m256i a, int imm8) pure @safe
// TODO __m256i _mm256_slli_si256 (__m256i a, const int imm8) pure @safe
// TODO __m128i _mm_sllv_epi32 (__m128i a, __m128i count) pure @safe
// TODO __m256i _mm256_sllv_epi32 (__m256i a, __m256i count) pure @safe
// TODO __m128i _mm_sllv_epi64 (__m128i a, __m128i count) pure @safe
// TODO __m256i _mm256_sllv_epi64 (__m256i a, __m256i count) pure @safe
// TODO __m256i _mm256_sra_epi16 (__m256i a, __m128i count) pure @safe
// TODO __m256i _mm256_sra_epi32 (__m256i a, __m128i count) pure @safe
// TODO __m256i _mm256_srai_epi16 (__m256i a, int imm8) pure @safe
// TODO __m256i _mm256_srai_epi32 (__m256i a, int imm8) pure @safe
// TODO __m128i _mm_srav_epi32 (__m128i a, __m128i count) pure @safe
// TODO __m256i _mm256_srav_epi32 (__m256i a, __m256i count) pure @safe
// TODO __m256i _mm256_srl_epi16 (__m256i a, __m128i count) pure @safe
// TODO __m256i _mm256_srl_epi32 (__m256i a, __m128i count) pure @safe
// TODO __m256i _mm256_srl_epi64 (__m256i a, __m128i count) pure @safe

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
// TODO verify
__m256i _mm256_srli_epi16 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrlwi256(cast(short16)a, cast(ubyte)imm8);
    }
    else
    {
        // PERF: ARM
        short16 sa = cast(short16)a;
        ubyte count = cast(ubyte)imm8;
        short16 r = cast(short16) _mm256_setzero_si256();
        if (count >= 16)
            return cast(__m256i)r;

        foreach(i; 0..16)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
        return cast(__m256i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7);
    short16 B = cast(short16) _mm256_srli_epi16(A, 1);
    short16 B2 = cast(short16) _mm256_srli_epi16(A, 1 + 256);
    short[16] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3, 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short16 C = cast(short16) _mm256_srli_epi16(A, 16);
    short[16] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);

    short16 D = cast(short16) _mm256_srli_epi16(A, 0);
    short[16] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7, 0, 1, 2, 3, -4, -5, 6, 7 ];
    assert(D.array == expectedD);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
// TODO verify
__m256i _mm256_srli_epi32 (__m256i a, int imm8) pure @trusted
{
    static if (GDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_AVX2)
    {
        return cast(__m256i) __builtin_ia32_psrldi256(cast(int8)a, cast(ubyte)imm8);
    }
    else
    {
        ubyte count = cast(ubyte) imm8;
        int8 a_int8 = cast(int8) a;

        // Note: the intrinsic guarantees that only imm8[0..7] is taken, whereas
        // in D shifting by as many or more bits than the width of the operand
        // is undefined behaviour, hence the explicit count check below.
        int8 r = cast(int8) _mm256_setzero_si256();
        if (count >= 32)
            return cast(__m256i) r;
        r.ptr[0] = a_int8.array[0] >>> count;
        r.ptr[1] = a_int8.array[1] >>> count;
        r.ptr[2] = a_int8.array[2] >>> count;
        r.ptr[3] = a_int8.array[3] >>> count;
        r.ptr[4] = a_int8.array[4] >>> count;
        r.ptr[5] = a_int8.array[5] >>> count;
        r.ptr[6] = a_int8.array[6] >>> count;
        r.ptr[7] = a_int8.array[7] >>> count;
        return cast(__m256i) r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0, 2, 3, -4, 0, 2, 3, -4);
    int8 B = cast(int8) _mm256_srli_epi32(A, 1);
    int8 B2 = cast(int8) _mm256_srli_epi32(A, 1 + 256);
    int[8] expectedB = [ 0, 1, 1, 0x7FFFFFFE, 0, 1, 1, 0x7FFFFFFE ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    int8 C = cast(int8) _mm256_srli_epi32(A, 255);
    int[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}

// TODO __m256i _mm256_srli_epi64 (__m256i a, int imm8) pure @safe
// TODO __m256i _mm256_srli_si256 (__m256i a, const int imm8) pure @safe
// TODO __m128i _mm_srlv_epi32 (__m128i a, __m128i count) pure @safe
// TODO __m256i _mm256_srlv_epi32 (__m256i a, __m256i count) pure @safe
// TODO __m128i _mm_srlv_epi64 (__m128i a, __m128i count) pure @safe
// TODO __m256i _mm256_srlv_epi64 (__m256i a, __m256i count) pure @safe
// TODO __m256i _mm256_stream_load_si256 (__m256i const* mem_addr) pure @safe
// TODO __m256i _mm256_sub_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_sub_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_sub_epi64 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_sub_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_subs_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_subs_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_subs_epu16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_subs_epu8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpackhi_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpackhi_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpackhi_epi64 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpackhi_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpacklo_epi16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpacklo_epi32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpacklo_epi64 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b) pure @safe

/// Compute the bitwise XOR of 256 bits (representing integer data) in `a` and `b`.
// TODO verify
__m256i _mm256_xor_si256 (__m256i a, __m256i b) pure @safe
{
    return a ^ b;
}
// TODO unittest and thus force inline
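// A minimal unittest sketch for the TODO above, mirroring the style of the
// other tests in this module; the test values are illustrative, chosen here
// rather than taken from the original source.
unittest
{
    __m256i A = _mm256_setr_epi32(7, -2, 9, 54654, 7, -2, 9, 54654);
    __m256i B = _mm256_setr_epi32(14, 78, 111, -256, 14, 78, 111, -256);
    int8 R = cast(int8) _mm256_xor_si256(A, B);
    int[8] correct = [7 ^ 14, (-2) ^ 78, 9 ^ 111, 54654 ^ (-256),
                      7 ^ 14, (-2) ^ 78, 9 ^ 111, 54654 ^ (-256)];
    assert(R.array == correct);
}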


/+

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.d")
int4 __builtin_ia32_gatherd_d(int4, const void*, int4, int4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.d.256")
int8 __builtin_ia32_gatherd_d256(int8, const void*, int8, int8, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.pd")
double2 __builtin_ia32_gatherd_pd(double2, const void*, int4, double2, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.pd.256")
double4 __builtin_ia32_gatherd_pd256(double4, const void*, int4, double4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.ps")
float4 __builtin_ia32_gatherd_ps(float4, const void*, int4, float4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.ps.256")
float8 __builtin_ia32_gatherd_ps256(float8, const void*, int8, float8, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.q")
long2 __builtin_ia32_gatherd_q(long2, const void*, int4, long2, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.d.q.256")
long4 __builtin_ia32_gatherd_q256(long4, const void*, int4, long4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.d")
int4 __builtin_ia32_gatherq_d(int4, const void*, long2, int4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.d.256")
int4 __builtin_ia32_gatherq_d256(int4, const void*, long4, int4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.pd")
double2 __builtin_ia32_gatherq_pd(double2, const void*, long2, double2, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.pd.256")
double4 __builtin_ia32_gatherq_pd256(double4, const void*, long4, double4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.ps")
float4 __builtin_ia32_gatherq_ps(float4, const void*, long2, float4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.ps.256")
float4 __builtin_ia32_gatherq_ps256(float4, const void*, long4, float4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.q")
long2 __builtin_ia32_gatherq_q(long2, const void*, long2, long2, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.q.256")
long4 __builtin_ia32_gatherq_q256(long4, const void*, long4, long4, byte);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.d")
int4 __builtin_ia32_maskloadd(const void*, int4);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.d.256")
int8 __builtin_ia32_maskloadd256(const void*, int8);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.q")
long2 __builtin_ia32_maskloadq(const void*, long2);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskload.q.256")
long4 __builtin_ia32_maskloadq256(const void*, long4);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.d")
void __builtin_ia32_maskstored(void*, int4, int4);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.d.256")
void __builtin_ia32_maskstored256(void*, int8, int8);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.q")
void __builtin_ia32_maskstoreq(void*, long2, long2);

pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.q.256")
void __builtin_ia32_maskstoreq256(void*, long4, long4);

pragma(LDC_intrinsic, "llvm.x86.avx2.mpsadbw")
short16 __builtin_ia32_mpsadbw256(byte32, byte32, byte) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.packssdw")
short16 __builtin_ia32_packssdw256(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.packsswb")
byte32 __builtin_ia32_packsswb256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.packusdw")
short16 __builtin_ia32_packusdw256(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.packuswb")
byte32 __builtin_ia32_packuswb256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pavg.b")
byte32 __builtin_ia32_pavgb256(byte32, byte32) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pavg.w")
short16 __builtin_ia32_pavgw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pblendvb")
byte32 __builtin_ia32_pblendvb256(byte32, byte32, byte32) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.permd")
int8 __builtin_ia32_permvarsi256(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.permps")
float8 __builtin_ia32_permvarsf256(float8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.phadd.d")
int8 __builtin_ia32_phaddd256(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.phadd.sw")
short16 __builtin_ia32_phaddsw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.phadd.w")
short16 __builtin_ia32_phaddw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.phsub.d")
int8 __builtin_ia32_phsubd256(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.phsub.sw")
short16 __builtin_ia32_phsubsw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.phsub.w")
short16 __builtin_ia32_phsubw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pmadd.ub.sw")
short16 __builtin_ia32_pmaddubsw256(byte32, byte32) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pmadd.wd")
int8 __builtin_ia32_pmaddwd256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pmovmskb")
int __builtin_ia32_pmovmskb256(byte32) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pmul.hr.sw")
short16 __builtin_ia32_pmulhrsw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pmulh.w")
short16 __builtin_ia32_pmulhw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pmulhu.w")
short16 __builtin_ia32_pmulhuw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psad.bw")
long4 __builtin_ia32_psadbw256(byte32, byte32) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pshuf.b")
byte32 __builtin_ia32_pshufb256(byte32, byte32) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psign.b")
byte32 __builtin_ia32_psignb256(byte32, byte32) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psign.d")
int8 __builtin_ia32_psignd256(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psign.w")
short16 __builtin_ia32_psignw256(short16, short16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psll.d")
int8 __builtin_ia32_pslld256(int8, int4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psll.q")
long4 __builtin_ia32_psllq256(long4, long2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psll.w")
short16 __builtin_ia32_psllw256(short16, short8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.d")
int8 __builtin_ia32_pslldi256(int8, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.q")
long4 __builtin_ia32_psllqi256(long4, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.pslli.w")
short16 __builtin_ia32_psllwi256(short16, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.d")
int4 __builtin_ia32_psllv4si(int4, int4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.d.256")
int8 __builtin_ia32_psllv8si(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.q")
long2 __builtin_ia32_psllv2di(long2, long2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psllv.q.256")
long4 __builtin_ia32_psllv4di(long4, long4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psra.d")
int8 __builtin_ia32_psrad256(int8, int4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psra.w")
short16 __builtin_ia32_psraw256(short16, short8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrai.d")
int8 __builtin_ia32_psradi256(int8, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrai.w")
short16 __builtin_ia32_psrawi256(short16, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrav.d")
int4 __builtin_ia32_psrav4si(int4, int4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrav.d.256")
int8 __builtin_ia32_psrav8si(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.d")
int8 __builtin_ia32_psrld256(int8, int4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.q")
long4 __builtin_ia32_psrlq256(long4, long2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrl.w")
short16 __builtin_ia32_psrlw256(short16, short8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.d")
int8 __builtin_ia32_psrldi256(int8, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.q")
long4 __builtin_ia32_psrlqi256(long4, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrli.w")
short16 __builtin_ia32_psrlwi256(short16, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.d")
int4 __builtin_ia32_psrlv4si(int4, int4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.d.256")
int8 __builtin_ia32_psrlv8si(int8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.q")
long2 __builtin_ia32_psrlv2di(long2, long2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx2.psrlv.q.256")
long4 __builtin_ia32_psrlv4di(long4, long4) pure @safe;

+/