/**
* AVX intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
*
* Copyright: Guillaume Piolat 2022.
*            Johan Engelen 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.avxintrin;

// AVX instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX
// Note: this header will work whether you have AVX enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
// generate AVX instructions.
// With GDC, use "dflags-gdc": ["-mavx"] or equivalent to actively
// generate AVX instructions.

public import inteli.types;
import inteli.internals;

// Pull in all previous instruction set intrinsics.
public import inteli.tmmintrin;

nothrow @nogc:

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_add_pd (__m256d a, __m256d b) pure @trusted
{
    return a + b;
}
unittest
{
    align(32) double[4] A = [-1, 2, -3, 40000];
    align(32) double[4] B = [ 9, -7, 8, -0.5];
    __m256d R = _mm256_add_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
    double[4] correct = [8, -5, 5, 39999.5];
    assert(R.array == correct);
}

/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_add_ps (__m256 a, __m256 b) pure @trusted
{
    return a + b;
}
unittest
{
    align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6];
    align(32) float[8] B = [ 9.0f, -7, 8, -0.5, 8, 7, 3, -1];
    __m256 R = _mm256_add_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
    float[8] correct = [8, -5, 5, 39999.5, 8, 10, 8, 5];
    assert(R.array == correct);
}

/// Alternatively add and subtract packed double-precision (64-bit) floating-point
/// elements in `a` to/from packed elements in `b`.
__m256d _mm256_addsub_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_addsubpd256(a, b);
    }
    else
    {
        // Note: GDC x86 generates addsubpd since GDC 11.1 with -O3
        //       LDC x86 generates addsubpd since LDC 1.18 with -O2
        //       LDC ARM: not fantastic, ok since LDC 1.18 -O2
        a.ptr[0] = a.array[0] + (-b.array[0]);
        a.ptr[1] = a.array[1] + b.array[1];
        a.ptr[2] = a.array[2] + (-b.array[2]);
        a.ptr[3] = a.array[3] + b.array[3];
        return a;
    }
}
unittest
{
    align(32) double[4] A = [-1, 2, -3, 40000];
    align(32) double[4] B = [ 9, -7, 8, -0.5];
    __m256d R = _mm256_addsub_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
    double[4] correct = [-10, -5, -11, 39999.5];
    assert(R.array == correct);
}

/// Alternatively add and subtract packed single-precision (32-bit) floating-point elements
/// in `a` to/from packed elements in `b`.
__m256 _mm256_addsub_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_addsubps256(a, b);
    }
    else
    {
        // Note: GDC x86 generates addsubps since GDC 11 -O3
        //       and in absence of AVX, a pair of SSE3 addsubps since GDC 12 -O2
        //       LDC x86 generates addsubps since LDC 1.18 -O2
        //       and in absence of AVX, a pair of SSE3 addsubps since LDC 1.1 -O1
        //       LDC ARM: neat output since LDC 1.21 -O2

        a.ptr[0] = a.array[0] + (-b.array[0]);
        a.ptr[1] = a.array[1] + b.array[1];
        a.ptr[2] = a.array[2] + (-b.array[2]);
        a.ptr[3] = a.array[3] + b.array[3];
        a.ptr[4] = a.array[4] + (-b.array[4]);
        a.ptr[5] = a.array[5] + b.array[5];
        a.ptr[6] = a.array[6] + (-b.array[6]);
        a.ptr[7] = a.array[7] + b.array[7];
        return a;
    }
}
unittest
{
    align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6];
    align(32) float[8] B = [ 9.0f, -7, 8, -0.5, 8, 7, 3, -1];
    __m256 R = _mm256_addsub_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
    float[8] correct = [ -10, -5, -11, 39999.5, -8, 10, 2, 5];
    assert(R.array == correct);
}

/// Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_and_pd (__m256d a, __m256d b) pure @trusted
{
    // Note: GCC avxintrin.h uses the builtins for AND NOTAND OR of _ps and _pd,
    // but those do not seem needed at any optimization level.
    return cast(__m256d)(cast(__m256i)a & cast(__m256i)b);
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m256d A = _mm256_set_pd(a, b, a, b);
    __m256d B = _mm256_set_pd(b, a, b, a);
    long4 R = cast(long4)( _mm256_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
    assert(R.array[2] == correct);
    assert(R.array[3] == correct);
}

/// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_and_ps (__m256 a, __m256 b) pure @trusted
{
    return cast(__m256)(cast(__m256i)a & cast(__m256i)b);
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
    __m256 A = _mm256_set_ps(a, b, a, b, a, b, a, b);
    __m256 B = _mm256_set_ps(b, a, b, a, b, a, b, a);
    int8 R = cast(int8)( _mm256_and_ps(A, B) );
    foreach(i; 0..8)
        assert(R.array[i] == correct);
}

/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`
/// and then AND with `b`.
__m256d _mm256_andnot_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    __m256i notA = _mm256_not_si256(cast(__m256i)a);
    __m256i ib = cast(__m256i)b;
    __m256i ab = notA & ib;
    return cast(__m256d)ab;
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long notA = ~ ( *cast(long*)(&a) );
    long correct = notA & (*cast(long*)(&b));
    __m256d A = _mm256_set_pd(a, a, a, a);
    __m256d B = _mm256_set_pd(b, b, b, b);
    long4 R = cast(long4)( _mm256_andnot_pd(A, B) );
    foreach(i; 0..4)
        assert(R.array[i] == correct);
}

/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a`
/// and then AND with `b`.
__m256 _mm256_andnot_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    __m256i notA = _mm256_not_si256(cast(__m256i)a);
    __m256i ib = cast(__m256i)b;
    __m256i ab = notA & ib;
    return cast(__m256)ab;
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int notA = ~ ( *cast(int*)(&a) );
    int correct = notA & (*cast(int*)(&b));
    __m256 A = _mm256_set1_ps(a);
    __m256 B = _mm256_set1_ps(b);
    int8 R = cast(int8)( _mm256_andnot_ps(A, B) );
    foreach(i; 0..8)
        assert(R.array[i] == correct);
}

/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control
/// mask `imm8`.
__m256d _mm256_blend_pd(int imm8)(__m256d a, __m256d b)
{
    static assert(imm8 >= 0 && imm8 < 16);

    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_blendpd256 (a, b, imm8);
    }
    else
    {
        // Works great with LDC.
        double4 r;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(0, 1, 2, 3);
    __m256d B = _mm256_setr_pd(8, 9, 10, 11);
    double4 C = _mm256_blend_pd!0x06(A, B);
    double[4] correct = [0, 9, 10, 3];
    assert(C.array == correct);
}

/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control
/// mask `imm8`.
__m256 _mm256_blend_ps(int imm8)(__m256 a, __m256 b) pure @trusted
{
    static assert(imm8 >= 0 && imm8 < 256);
    // PERF DMD
    // PERF ARM64: not awesome with some constant values, up to 8/9 instructions
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_blendps256 (a, b, imm8);
    }
    else
    {
        // LDC x86: vblendps generated since LDC 1.27 -O1
        float8 r;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
    __m256 B = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15);
    float8 C = _mm256_blend_ps!0xe7(A, B);
    float[8] correct = [8, 9, 10, 3, 4, 13, 14, 15];
    assert(C.array == correct);
}

/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        // Amazingly enough, GCC/GDC generates the vblendvpd instruction
        // with -mavx2 but not -mavx.
        // Not sure what is the reason, and there is a replacement sequence.
        // PERF: Sounds like a bug, similar to _mm_blendv_pd
        return __builtin_ia32_blendvpd256(a, b, mask);
    }
    else static if (LDC_with_AVX)
    {
        return __builtin_ia32_blendvpd256(a, b, mask);
    }
    else
    {
        // LDC x86: vblendvpd since LDC 1.27 -O2
        // arm64: only 4 instructions, since LDC 1.27 -O2
        __m256d r;
        long4 lmask = cast(long4)mask;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    __m256d B = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
    __m256d M = _mm256_setr_pd(-3.0, 2.0, 1.0, -4.0);
    __m256d R = _mm256_blendv_pd(A, B, M);
    double[4] correct1 = [5.0, 2.0, 3.0, 8.0];
    assert(R.array == correct1); // Note: probably the same NaN-mask oddity exists on arm64+linux as with _mm_blendv_pd
}

/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b`
/// using `mask`.
__m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask) @trusted
{
    // PERF DMD
    // PERF LDC/GDC without AVX could use two intrinsics for each part
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_blendvps256(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        int8 shift;
        shift = 31;
        int8 lmask = cast(int8)mask >> shift;
        int8 ia = cast(int8)a;
        int8 ib = cast(int8)b;
        return cast(__m256)(ia ^ ((ia ^ ib) & lmask));
    }
    else
    {
        __m256 r = void; // PERF =void;
        int8 lmask = cast(int8)mask;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
    __m256 B = _mm256_setr_ps(5.0f, 6.0f, 7.0f, 8.0f, 5.0f, 6.0f, 7.0f, 8.0f);
    __m256 M = _mm256_setr_ps(-3.0f, 2.0f, 1.0f, -4.0f, -3.0f, 2.0f, 1.0f, -4.0f);
    __m256 R = _mm256_blendv_ps(A, B, M);
    float[8] correct1 = [5.0f, 2.0f, 3.0f, 8.0f, 5.0f, 2.0f, 3.0f, 8.0f];
    assert(R.array == correct1); // Note: probably the same NaN-mask oddity exists on arm64+linux as with _mm_blendv_pd
}

/// Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit)
/// floating-point elements) to all elements.
/// This effectively duplicates the 128-bit vector.
__m256d _mm256_broadcast_pd (const(__m128d)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastf128_pd256(cast(float4*)mem_addr);
    }
    else
    {
        const(double)* p = cast(const(double)*) mem_addr;
        __m256d r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[0];
        r.ptr[3] = p[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(3, -4);
    __m256d B = _mm256_broadcast_pd(&A);
    double[4] correct = [3, -4, 3, -4];
    assert(B.array == correct);
}

/// Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit)
/// floating-point elements) to all elements.
/// This effectively duplicates the 128-bit vector.
__m256 _mm256_broadcast_ps (const(__m128)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastf128_ps256(cast(float4*)mem_addr);
    }
    else
    {
        const(float)* p = cast(const(float)*)mem_addr;
        __m256 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        r.ptr[4] = p[0];
        r.ptr[5] = p[1];
        r.ptr[6] = p[2];
        r.ptr[7] = p[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, -4);
    __m256 B = _mm256_broadcast_ps(&A);
    float[8] correct = [1.0f, 2, 3, -4, 1, 2, 3, -4];
    assert(B.array == correct);
}

/// Broadcast a double-precision (64-bit) floating-point element from memory to all elements.
__m256d _mm256_broadcast_sd (const(double)* mem_addr) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastsd256(mem_addr);
    }
    else
    {
        double a = *mem_addr;
        __m256d r;
        r.ptr[0] = a;
        r.ptr[1] = a;
        r.ptr[2] = a;
        r.ptr[3] = a;
        return r;
    }
}
unittest
{
    double t = 7.5f;
    __m256d A = _mm256_broadcast_sd(&t);
    double[4] correct = [7.5, 7.5, 7.5, 7.5];
    assert(A.array == correct);
}

/// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
__m128 _mm_broadcast_ss (const(float)* mem_addr) pure @trusted
{
    // PERF: DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastss(mem_addr);
    }
    else
    {
        float a = *mem_addr;
        __m128 r;
        r.ptr[0] = a;
        r.ptr[1] = a;
        r.ptr[2] = a;
        r.ptr[3] = a;
        return r;
    }
}
unittest
{
    float t = 7.5f;
    __m128 A = _mm_broadcast_ss(&t);
    float[4] correct = [7.5f, 7.5f, 7.5f, 7.5f];
    assert(A.array == correct);
}

/// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
__m256 _mm256_broadcast_ss (const(float)* mem_addr)
{
    // PERF: DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastss256 (mem_addr);
    }
    else
    {
        float a = *mem_addr;
        __m256 r = __m256(a);
        return r;
    }
}
unittest
{
    float t = 7.5f;
    __m256 A = _mm256_broadcast_ss(&t);
    float[8] correct = [7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f];
    assert(A.array == correct);
}

/// Cast vector of type `__m256d` to type `__m256`.
__m256 _mm256_castpd_ps (__m256d a) pure @safe
{
    return cast(__m256)a;
}

/// Cast vector of type `__m256d` to type `__m256i`.
__m256i _mm256_castpd_si256 (__m256d a) pure @safe
{
    return cast(__m256i)a;
}
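
// Added usage sketch (not from the original test suite): the two casts above
// are pure bit reinterpretations, so a value survives a round trip through
// `__m256` / `__m256i` unchanged. Uses `_mm256_castps_pd` and
// `_mm256_castsi256_pd`, defined later in this module.
unittest
{
    __m256d A = _mm256_set1_pd(-2.0);
    __m256d B = _mm256_castps_pd(_mm256_castpd_ps(A));       // to float vector and back
    __m256d C = _mm256_castsi256_pd(_mm256_castpd_si256(A)); // to int vector and back
    assert(B.array == A.array);
    assert(C.array == A.array);
}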

/// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are undefined.
__m256d _mm256_castpd128_pd256 (__m128d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_pd256_pd(a);
    }
    else
    {
        __m256d r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, -6.125);
    __m256d B = _mm256_castpd128_pd256(A);
    assert(B.array[0] == 4.0);
    assert(B.array[1] == -6.125);
}

/// Cast vector of type `__m256d` to type `__m128d`; the upper 128 bits of `a` are lost.
__m128d _mm256_castpd256_pd128 (__m256d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_pd_pd256(a);
    }
    else
    {
        __m128d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_set_pd(1, 2, -6.25, 4.0);
    __m128d B = _mm256_castpd256_pd128(A);
    assert(B.array[0] == 4.0);
    assert(B.array[1] == -6.25);
}

/// Cast vector of type `__m256` to type `__m256d`.
__m256d _mm256_castps_pd (__m256 a) pure @safe
{
    return cast(__m256d)a;
}

/// Cast vector of type `__m256` to type `__m256i`.
__m256i _mm256_castps_si256 (__m256 a) pure @safe
{
    return cast(__m256i)a;
}

/// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are undefined.
__m256 _mm256_castps128_ps256 (__m128 a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_ps256_ps(a);
    }
    else
    {
        __m256 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
    __m256 B = _mm256_castps128_ps256(A);
    float[4] correct = [1.0f, 2, 3, 4];
    assert(B.array[0..4] == correct);
}

/// Cast vector of type `__m256` to type `__m128`. The upper 128 bits of `a` are lost.
__m128 _mm256_castps256_ps128 (__m256 a) pure @trusted
{
    return *cast(const(__m128)*)(&a);
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    __m128 B = _mm256_castps256_ps128(A);
    float[4] correct = [1.0f, 2, 3, 4];
    assert(B.array == correct);
}

/// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are undefined.
__m256i _mm256_castsi128_si256 (__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    long4 r = void;
    r.ptr[0] = la.array[0];
    r.ptr[1] = la.array[1];
    return r;
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 42);
    __m256i B = _mm256_castsi128_si256(A);
    long[2] correct = [-1, 42];
    assert(B.array[0..2] == correct);
}

/// Cast vector of type `__m256i` to type `__m256d`.
__m256d _mm256_castsi256_pd (__m256i a) pure @safe
{
    return cast(__m256d)a;
}

/// Cast vector of type `__m256i` to type `__m256`.
__m256 _mm256_castsi256_ps (__m256i a) pure @safe
{
    return cast(__m256)a;
}
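
// Added usage sketch (not from the original test suite): casting between
// `__m256i` and `__m256` also preserves the raw bits, which is handy for
// building float masks out of integer constants.
unittest
{
    int signBit = cast(int)0x80000000;
    __m256i mask = _mm256_set1_epi32(signBit);
    __m256 fmask = _mm256_castsi256_ps(mask);
    int8 back = cast(int8) _mm256_castps_si256(fmask);
    foreach(i; 0..8)
        assert(back.array[i] == signBit);
}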

/// Cast vector of type `__m256i` to type `__m128i`. The upper 128 bits of `a` are lost.
__m128i _mm256_castsi256_si128 (__m256i a) pure @trusted
{
    long2 r = void;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[1];
    return cast(__m128i)r;
}
unittest
{
    long4 A;
    A.ptr[0] = -1;
    A.ptr[1] = 42;
    long2 B = cast(long2)(_mm256_castsi256_si128(A));
    long[2] correct = [-1, 42];
    assert(B.array[0..2] == correct);
}

// TODO __m256d _mm256_ceil_pd (__m256d a)
// TODO __m256 _mm256_ceil_ps (__m256 a)

// TODO __m128d _mm_cmp_pd (__m128d a, __m128d b, const int imm8)
// TODO __m256d _mm256_cmp_pd (__m256d a, __m256d b, const int imm8)
// TODO __m128 _mm_cmp_ps (__m128 a, __m128 b, const int imm8)
// TODO __m256 _mm256_cmp_ps (__m256 a, __m256 b, const int imm8)
// TODO __m128d _mm_cmp_sd (__m128d a, __m128d b, const int imm8)
// TODO __m128 _mm_cmp_ss (__m128 a, __m128 b, const int imm8)

/// Convert packed signed 32-bit integers in `a` to packed double-precision (64-bit) floating-point
/// elements.
__m256d _mm256_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x double>
            ret <4 x double> %r`;
        return LDCInlineIR!(ir, double4, __m128i)(a);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_cvtdq2pd256(a);
    }
    else
    {
        double4 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m256d R = _mm256_cvtepi32_pd(_mm_set1_epi32(54));
    double[4] correct = [54.0, 54, 54, 54];
    assert(R.array == correct);
}

/// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point
/// elements.
__m256 _mm256_cvtepi32_ps (__m256i a) pure @trusted
{
    version(LDC)
    {
        enum ir = `
            %r = sitofp <8 x i32> %0 to <8 x float>
            ret <8 x float> %r`;
        return LDCInlineIR!(ir, float8, int8)(cast(int8)a);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_cvtdq2ps256(cast(int8)a);
    }
    else
    {
        int8 ia = cast(int8)a;
        __m256 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ia.array[1];
        r.ptr[2] = ia.array[2];
        r.ptr[3] = ia.array[3];
        r.ptr[4] = ia.array[4];
        r.ptr[5] = ia.array[5];
        r.ptr[6] = ia.array[6];
        r.ptr[7] = ia.array[7];
        return r;
    }
}
unittest
{
    __m256 R = _mm256_cvtepi32_ps(_mm256_set1_epi32(5));
    float[8] correct = [5.0f, 5, 5, 5, 5, 5, 5, 5];
    assert(R.array == correct);
}

// TODO __m128i _mm256_cvtpd_epi32 (__m256d a)

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed single-precision (32-bit)
/// floating-point elements.
__m128 _mm256_cvtpd_ps (__m256d a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_cvtpd2ps256(a);
    }
    else
    {
        __m128 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 5);
    __m128 R = _mm256_cvtpd_ps(A);
    float[4] correct = [1.0f, 2, 3, 5];
    assert(R.array == correct);
}

// TODO __m256i _mm256_cvtps_epi32 (__m256 a)

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed double-precision
/// (64-bit) floating-point elements.
__m256d _mm256_cvtps_pd (__m128 a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_cvtps2pd256(a); // LDC doesn't have the builtin
    }
    else
    {
        // LDC: x86, needs -O2 to generate cvtps2pd since LDC 1.2.0
        __m256d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 5);
    __m256d R = _mm256_cvtps_pd(A);
    double[4] correct = [1.0, 2, 3, 5];
    assert(R.array == correct);
}

/// Return the lower double-precision (64-bit) floating-point element of `a`.
double _mm256_cvtsd_f64 (__m256d a) pure @safe
{
    return a.array[0];
}

/// Return the lower 32-bit integer in `a`.
int _mm256_cvtsi256_si32 (__m256i a) pure @safe
{
    return (cast(int8)a).array[0];
}

/// Return the lower single-precision (32-bit) floating-point element of `a`.
float _mm256_cvtss_f32 (__m256 a) pure @safe
{
    return a.array[0];
}
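
// Added usage sketch (not from the original test suite): the three scalar
// accessors above simply read lane 0 of their respective vector types.
unittest
{
    assert(_mm256_cvtsd_f64(_mm256_setr_pd(2.5, 0, 0, 0)) == 2.5);
    assert(_mm256_cvtsi256_si32(_mm256_setr_epi32(42, 0, 0, 0, 0, 0, 0, 0)) == 42);
    assert(_mm256_cvtss_f32(_mm256_setr_ps(-1.5f, 0, 0, 0, 0, 0, 0, 0)) == -1.5f);
}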

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit
/// integers with truncation.
__m128i _mm256_cvttpd_epi32 (__m256d a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m128i)__builtin_ia32_cvttpd2dq256(a);
    }
    else
    {
        __m128i r;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = cast(int)a.array[2];
        r.ptr[3] = cast(int)a.array[3];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_set_pd(4.7, -1000.9, -7.1, 3.1);
    __m128i R = _mm256_cvttpd_epi32(A);
    int[4] correct = [3, -7, -1000, 4];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit
/// integers with truncation.
__m256i _mm256_cvttps_epi32 (__m256 a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256i)__builtin_ia32_cvttps2dq256(a);
    }
    else
    {
        int8 r;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = cast(int)a.array[2];
        r.ptr[3] = cast(int)a.array[3];
        r.ptr[4] = cast(int)a.array[4];
        r.ptr[5] = cast(int)a.array[5];
        r.ptr[6] = cast(int)a.array[6];
        r.ptr[7] = cast(int)a.array[7];
        return cast(__m256i)r;
    }
}
unittest
{
    __m256 A = _mm256_set_ps(4.7, -1000.9, -7.1, 3.1, 1.4, 2.9, -2.9, 0);
    int8 R = cast(int8) _mm256_cvttps_epi32(A);
    int[8] correct = [0, -2, 2, 1, 3, -7, -1000, 4];
    assert(R.array == correct);
}

/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m256d _mm256_div_pd (__m256d a, __m256d b) pure @safe
{
    return a / b;
}
unittest
{
    __m256d a = [1.5, -2.0, 3.0, 1.0];
    a = _mm256_div_pd(a, a);
    double[4] correct = [1.0, 1.0, 1.0, 1.0];
    assert(a.array == correct);
}

/// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
__m256 _mm256_div_ps (__m256 a, __m256 b) pure @safe
{
    return a / b;
}
unittest
{
    __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 4.5f, -5.0f, 6.0f, 7.0f];
    a = _mm256_div_ps(a, a);
    float[8] correct = [1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}

/// Conditionally multiply the packed single-precision (32-bit) floating-point elements in `a` and
/// `b` using the high 4 bits in `imm8`, sum the four products, and conditionally store the sum
/// using the low 4 bits of `imm8`.
__m256 _mm256_dp_ps(int imm8)(__m256 a, __m256 b)
{
    // PERF DMD
    // PERF without AVX, can use 2 _mm_dp_ps exactly (beware the imm8 is tricky)
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_dpps256(a, b, cast(ubyte)imm8);
    }
    else
    {
        __m256 zero = _mm256_setzero_ps();
        enum ubyte op = (imm8 >>> 4) & 15;
        __m256 temp = _mm256_blend_ps!( op | (op << 4) )(zero, a * b);
        float lo = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
        float hi = temp.array[4] + temp.array[5] + temp.array[6] + temp.array[7];
        __m256 r = _mm256_set_m128(_mm_set1_ps(hi), _mm_set1_ps(lo));
        enum ubyte op2 = (imm8 & 15);
        return _mm256_blend_ps!(op2 | (op2 << 4))(zero, r);
    }
}
unittest
{
    // Products:                 9    14    20    24     6    16    12   -24
    __m256 A = _mm256_setr_ps(1.0f, 2.0f, 4.0f, 8.0f, 1.0f, 2.0f, 4.0f, 8.0f);
    __m256 B = _mm256_setr_ps(9.0f, 7.0f, 5.0f, 3.0f, 6.0f, 8.0f, 3.0f,-3.0f);
    float8 R1 = _mm256_dp_ps!(0xf0 + 0xf)(A, B);
    float8 R2 = _mm256_dp_ps!(0x30 + 0x5)(A, B);
    float8 R3 = _mm256_dp_ps!(0x50 + 0xa)(A, B);
    float[8] correct1 = [67.0f, 67.0f, 67.0f,67.0f, 10, 10, 10, 10];
    float[8] correct2 = [23.0f, 0.0f, 23.0f, 0.0f, 22, 0, 22, 0];
    float[8] correct3 = [0.0f, 29.0f, 0.0f, 29.0f, 0, 18, 0, 18];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}

/// Extract a 32-bit integer from `a`, selected with `imm8`.
int _mm256_extract_epi32 (__m256i a, const int imm8) pure @trusted
{
    return (cast(int8)a).array[imm8 & 7];
}
unittest
{
    align(16) int[8] data = [-1, 2, -3, 4, 9, -7, 8, -6];
    auto A = _mm256_loadu_si256(cast(__m256i*) data.ptr);
    assert(_mm256_extract_epi32(A, 0) == -1);
    assert(_mm256_extract_epi32(A, 1 + 8) == 2);
    assert(_mm256_extract_epi32(A, 3 + 16) == 4);
    assert(_mm256_extract_epi32(A, 7 + 32) == -6);
}

/// Extract a 64-bit integer from `a`, selected with `index`.
long _mm256_extract_epi64 (__m256i a, const int index) pure @safe
{
    return a.array[index & 3];
}
unittest
{
    __m256i A = _mm256_setr_epi64x(-7, 6, 42, 0);
    assert(_mm256_extract_epi64(A, -8) == -7);
    assert(_mm256_extract_epi64(A, 1) == 6);
    assert(_mm256_extract_epi64(A, 2 + 4) == 42);
}

/// Extract a 128-bit lane from `a`, selected with `imm8` (0 or 1).
__m128d _mm256_extractf128_pd(ubyte imm8)(__m256d a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_AVX)
    {
        // Note: needs to be a template intrinsic because of this builtin.
        return __builtin_ia32_vextractf128_pd256(a, imm8 & 1);
    }
    else
    {
        double2 r = void;
        enum int index = 2*(imm8 & 1);
        r.ptr[0] = a.array[index+0];
        r.ptr[1] = a.array[index+1];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    double[4] correct = [1.0, 2, 3, 4];
    __m128d l0 = _mm256_extractf128_pd!18(A);
    __m128d l1 = _mm256_extractf128_pd!55(A);
    assert(l0.array == correct[0..2]);
    assert(l1.array == correct[2..4]);
}

///ditto
__m128 _mm256_extractf128_ps(ubyte imm8)(__m256 a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vextractf128_ps256(a, imm8 & 1);
    }
    else
    {
        float4 r = void; // Optimize well since LDC 1.1 -O1
        enum int index = 4*(imm8 & 1);
        r.ptr[0] = a.array[index+0];
        r.ptr[1] = a.array[index+1];
        r.ptr[2] = a.array[index+2];
        r.ptr[3] = a.array[index+3];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0, 2, 3, 4, 5, 6, 7, 8);
    float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8];
    __m128 l0 = _mm256_extractf128_ps!8(A);
    __m128 l1 = _mm256_extractf128_ps!255(A);
    assert(l0.array == correct[0..4]);
    assert(l1.array == correct[4..8]);
}

///ditto
__m128i _mm256_extractf128_si256(ubyte imm8)(__m256i a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_AVX)
    {
        // Note: if it weren't for this GDC intrinsic, _mm256_extractf128_si256
        // could be a non-template, however, this wins in -O0.
        // Same story for _mm256_extractf128_ps and _mm256_extractf128_pd
        return __builtin_ia32_vextractf128_si256(cast(int8)a, imm8 & 1);
    }
    else
    {
        long2 r = void;
        enum int index = 2*(imm8 & 1);
        r.ptr[0] = a.array[index+0];
        r.ptr[1] = a.array[index+1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(9, 2, 3, 4, 5, 6, 7, 8);
    int[8] correct = [9, 2, 3, 4, 5, 6, 7, 8];
    __m128i l0 = _mm256_extractf128_si256!0(A);
    __m128i l1 = _mm256_extractf128_si256!1(A);
    assert(l0.array == correct[0..4]);
    assert(l1.array == correct[4..8]);
}

// TODO __m256d _mm256_floor_pd (__m256d a)
// TODO __m256 _mm256_floor_ps (__m256 a)

/// Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in `a`
/// and `b`.
__m256d _mm256_hadd_pd (__m256d a, __m256d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_haddpd256(a, b);
    }
    else
    {
        __m256d res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = b.array[1] + b.array[0];
        res.ptr[2] = a.array[3] + a.array[2];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
    __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
    __m256d C = _mm256_hadd_pd(A, B);
    double[4] correct = [3.5, 8.0, 30.0, 114.0];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in `a` and
/// `b`.
__m256 _mm256_hadd_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_haddps256(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 b_hi = _mm256_extractf128_ps!1(b);
        __m128 b_lo = _mm256_extractf128_ps!0(b);
        __m128 hi = vpaddq_f32(a_hi, b_hi);
        __m128 lo = vpaddq_f32(a_lo, b_lo);
        return _mm256_set_m128(hi, lo);
    }
    else
    {
        __m256 res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        res.ptr[4] = a.array[5] + a.array[4];
        res.ptr[5] = a.array[7] + a.array[6];
        res.ptr[6] = b.array[5] + b.array[4];
        res.ptr[7] = b.array[7] + b.array[6];
        return res;
    }
}
unittest
{
    __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
    __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
    __m256 R = _mm256_hadd_ps(A, B);
    float[8] correct = [3.0f, 8.0f, 3.5f, 7.5f, 3.0f, 8.0f, 3.5f, 8.5f];
    assert(R.array == correct);
}

/// Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in
/// `a` and `b`.
__m256d _mm256_hsub_pd (__m256d a, __m256d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_hsubpd256(a, b);
    }
    else
    {
        // 2 zip1, 2 zip2, 2 fsub... I don't think there is better in arm64
        __m256d res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = b.array[0] - b.array[1];
        res.ptr[2] = a.array[2] - a.array[3];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
    __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
    __m256d C = _mm256_hsub_pd(A, B);
    double[4] correct = [-0.5, -6.0, 12.0, 86.0];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in
/// `a` and `b`.
__m256 _mm256_hsub_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_hsubps256(a, b);
    }
    else
    {
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 b_hi = _mm256_extractf128_ps!1(b);
        __m128 b_lo = _mm256_extractf128_ps!0(b);
        __m128 hi = _mm_hsub_ps(a_hi, b_hi);
        __m128 lo = _mm_hsub_ps(a_lo, b_lo);
        return _mm256_set_m128(hi, lo);
    }
}
unittest
{
    __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
    __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
    __m256 R = _mm256_hsub_ps(A, B);
    float[8] correct = [-1.0f, -2.0f, -0.5f, -0.5f, -1.0f, -2.0f, -0.5f, -1.5f];
    assert(R.array == correct);
}

// TODO __m256i _mm256_insert_epi16 (__m256i a, __int16 i, const int index)
// TODO __m256i _mm256_insert_epi32 (__m256i a, __int32 i, const int index)
// TODO __m256i _mm256_insert_epi64 (__m256i a, __int64 i, const int index)
// TODO __m256i _mm256_insert_epi8 (__m256i a, __int8 i, const int index)

/// Copy `a`, then insert 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `b` at the location specified by `imm8`.
__m256d _mm256_insertf128_pd(int imm8)(__m256d a, __m128d b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return __builtin_ia32_vinsertf128_pd256(a, b, lane);
    }
    else
    {
        __m256d r = a;
        enum int index = (imm8 & 1) ? 2 : 0;
        r.ptr[index] = b.array[0];
        r.ptr[index+1] = b.array[1];
        return r;
    }
}

/// Copy `a`, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point
/// elements) from `b` at the location specified by `imm8`.
__m256 _mm256_insertf128_ps(int imm8)(__m256 a, __m128 b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return __builtin_ia32_vinsertf128_ps256(a, b, lane);
    }
    else
    {
        __m256 r = a;
        enum int index = (imm8 & 1) ? 4 : 0;
        r.ptr[index] = b.array[0];
        r.ptr[index+1] = b.array[1];
        r.ptr[index+2] = b.array[2];
        r.ptr[index+3] = b.array[3];
        return r;
    }
}

/// Copy `a`, then insert 128 bits from `b` at the location specified by `imm8`.
__m256i _mm256_insertf128_si256(int imm8)(__m256i a, __m128i b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return cast(__m256i) __builtin_ia32_vinsertf128_si256 (cast(int8)a, b, lane);
    }
    else
    {
        long2 lb = cast(long2)b;
        __m256i r = a;
        enum int index = (imm8 & 1) ? 2 : 0;
        r.ptr[index] = lb.array[0];
        r.ptr[index+1] = lb.array[1];
        return r;
    }
}
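
// Added usage sketch (not from the original test suite): the three insertf128
// variants above replace the low lane with imm8 = 0 and the high lane with
// imm8 = 1.
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    __m256d R0 = _mm256_insertf128_pd!0(A, _mm_setr_pd(9.0, 10));
    __m256d R1 = _mm256_insertf128_pd!1(A, _mm_setr_pd(9.0, 10));
    double[4] correct0 = [9.0, 10, 3, 4];
    double[4] correct1 = [1.0, 2, 9, 10];
    assert(R0.array == correct0);
    assert(R1.array == correct1);

    __m256 B = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    __m256 R2 = _mm256_insertf128_ps!1(B, _mm_setr_ps(-1.0f, -2, -3, -4));
    float[8] correct2 = [0.0f, 1, 2, 3, -1, -2, -3, -4];
    assert(R2.array == correct2);

    __m256i C = _mm256_setr_epi64x(1, 2, 3, 4);
    long4 R3 = cast(long4) _mm256_insertf128_si256!0(C, _mm_set1_epi32(-1));
    long[4] correct3 = [-1, -1, 3, 4];
    assert(R3.array == correct3);
}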

/// Load 256-bits of integer data from unaligned memory into dst.
/// This intrinsic may perform better than `_mm256_loadu_si256` when the data crosses a cache
/// line boundary.
__m256i _mm256_lddqu_si256(const(__m256i)* mem_addr) @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_lddqu256(cast(const(char)*)mem_addr);
    }
    else
        return _mm256_loadu_si256(mem_addr);
}
unittest
{
    int[10] correct = [0, -1, 2, -3, 4, 9, -7, 8, -6, 34];
    int8 A = cast(int8) _mm256_lddqu_si256(cast(__m256i*) &correct[1]);
    assert(A.array == correct[1..9]);
}

/// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements)
/// from memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
/// exception may be generated.
__m256d _mm256_load_pd (const(double)* mem_addr) pure @trusted
{
    return *cast(__m256d*)mem_addr;
}
unittest
{
    static immutable align(32) double[4] correct = [1.0, 2.0, 3.5, -42.0];
    __m256d A = _mm256_load_pd(correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
__m256 _mm256_load_ps (const(float)* mem_addr) pure @trusted
{
    return *cast(__m256*)mem_addr;
}
unittest
{
    static immutable align(32) float[8] correct =
        [1.0, 2.0, 3.5, -42.0, 7.43f, 0.0f, 3, 2];
    __m256 A = _mm256_load_ps(correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits of integer data from memory. `mem_addr` does not need to be aligned on
/// any particular boundary.
// See this dlang forum post => https://forum.dlang.org/thread/vymrsngsfibkmqsqffce@forum.dlang.org
__m256i _mm256_loadu_si256 (const(__m256i)* mem_addr) pure @trusted // TODO: signature
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m256i)(cast(long*)mem_addr);
    }
    else
    {
        const(long)* p = cast(const(long)*)mem_addr;
        long4 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        return r;
    }
}
unittest
{
    align(16) int[8] correct = [-1, 2, -3, 4, 9, -7, 8, -6];
    int8 A = cast(int8) _mm256_loadu_si256(cast(__m256i*) correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits of integer data from memory. `mem_addr` must be aligned on a
/// 32-byte boundary or a general-protection exception may be generated.
__m256i _mm256_load_si256 (const(void)* mem_addr) pure @system
{
    return *cast(__m256i*)mem_addr;
}
unittest
{
    static immutable align(64) long[4] correct = [1, -2, long.min, long.max];
    __m256i A = _mm256_load_si256(correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements)
/// from memory. `mem_addr` does not need to be aligned on any particular boundary.
__m256d _mm256_loadu_pd (const(void)* mem_addr) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_loadupd256 ( cast(const(double)*) mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m256d)(cast(double*)mem_addr);
    }
    else
    {
        const(double)* p = cast(const(double)*)mem_addr;
        double4 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        return r;
    }
}
unittest
{
    double[4] correct = [1.0, -2.0, 0.0, 768.5];
    __m256d A = _mm256_loadu_pd(correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m256 _mm256_loadu_ps (const(float)* mem_addr) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_loadups256 ( cast(const(float)*) mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m256)(cast(float*)mem_addr);
    }
    else
    {
        const(float)* p = cast(const(float)*)mem_addr;
        float8 r = void;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        r.ptr[4] = p[4];
        r.ptr[5] = p[5];
        r.ptr[6] = p[6];
        r.ptr[7] = p[7];
        return r;
    }
}
unittest
{
    align(32) float[10] correct = [0.0f, 1, 2, 3, 4, 5, 6, 7, 8, 9];
    __m256 A = _mm256_loadu_ps(&correct[1]);
    assert(A.array == correct[1..9]);
}

/// Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point
/// elements) from memory, and combine them into a 256-bit value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256 _mm256_loadu2_m128 (const(float)* hiaddr, const(float)* loaddr) pure @system
{
    // Note: no particular instruction for this in x86.
    return _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr));
}
unittest
{
    align(32) float[6] A = [4.5f, 2, 8, 97, -1, 3];
    align(32) float[6] B = [6.5f, 3, 9, 98, -2, 4];
    __m256 R = _mm256_loadu2_m128(&B[1], &A[1]);
    float[8] correct = [2.0f, 8, 97, -1, 3, 9, 98, -2];
    assert(R.array == correct);
}

/// Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point
/// elements) from memory, and combine them into a 256-bit value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256d _mm256_loadu2_m128d (const(double)* hiaddr, const(double)* loaddr) pure @system
{
    // Note: no particular instruction for this in x86.
    return _mm256_set_m128d(_mm_loadu_pd(hiaddr), _mm_loadu_pd(loaddr));
}
unittest
{
    align(32) double[4] A = [4.5f, 2, 8, 97];
    align(32) double[4] B = [6.5f, 3, 9, 98];
    __m256d R = _mm256_loadu2_m128d(&B[1], &A[1]);
    double[4] correct = [2.0, 8, 3, 9];
    assert(R.array == correct);
}

/// Load two 128-bit values (composed of integer data) from memory, and combine them into a
/// 256-bit value. `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256i _mm256_loadu2_m128i (const(__m128i)* hiaddr, const(__m128i)* loaddr) pure @trusted
{
    // Note: no particular instruction for this in x86.
    return _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr));
}
unittest
{
    align(32) long[4] A = [5, 2, 8, 97];
    align(32) long[4] B = [6, 3, 9, 98];
    __m256i R = _mm256_loadu2_m128i(cast(const(__m128i)*) &B[1], cast(const(__m128i)*) &A[1]);
    long[4] correct = [2, 8, 3, 9];
    assert(R.array == correct);
}

// TODO __m128d _mm_maskload_pd (double const * mem_addr, __m128i mask)
// TODO __m256d _mm256_maskload_pd (double const * mem_addr, __m256i mask)
// TODO __m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)
// TODO __m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)
// TODO void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a)
// TODO void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a)
// TODO void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)
// TODO void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return
/// packed maximum values.
__m256d _mm256_max_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_maxpd256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2
        // PERF: GDC without AVX
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
    __m256d B = _mm256_setr_pd(1.0, 8.0, 0.0, 100000.0);
    __m256d M = _mm256_max_pd(A, B);
    double[4] correct = [4.0, 8.0, 0.0, double.infinity];
    assert(M.array == correct);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return
/// packed maximum values.
__m256 _mm256_max_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_maxps256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2, but looks brittle.
        // PERF GDC without AVX
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
        a.ptr[4] = (a.array[4] > b.array[4]) ? a.array[4] : b.array[4];
        a.ptr[5] = (a.array[5] > b.array[5]) ? a.array[5] : b.array[5];
        a.ptr[6] = (a.array[6] > b.array[6]) ? a.array[6] : b.array[6];
        a.ptr[7] = (a.array[7] > b.array[7]) ? a.array[7] : b.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
    __m256 B = _mm256_setr_ps(1.0, 8.0, 0.0, 100000.0f     , 4, 3, 2, 1);
    __m256 M = _mm256_max_ps(A, B);
    float[8] correct =       [4.0, 8.0, 0.0, float.infinity , 4, 3, 3, 4];
    assert(M.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return
/// packed minimum values.
__m256d _mm256_min_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_minpd256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2
        // PERF: GDC without AVX
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
    __m256d B = _mm256_setr_pd(1.0, 8.0, 0.0, 100000.0);
    __m256d M = _mm256_min_pd(A, B);
    double[4] correct = [1.0, 1.0, -9.0, 100000.0];
    assert(M.array == correct);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return
/// packed minimum values.
__m256 _mm256_min_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_minps256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2, but looks brittle.
        // PERF GDC without AVX
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
        a.ptr[4] = (a.array[4] < b.array[4]) ? a.array[4] : b.array[4];
        a.ptr[5] = (a.array[5] < b.array[5]) ? a.array[5] : b.array[5];
        a.ptr[6] = (a.array[6] < b.array[6]) ? a.array[6] : b.array[6];
        a.ptr[7] = (a.array[7] < b.array[7]) ? a.array[7] : b.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
    __m256 B = _mm256_setr_ps(1.0, 8.0, 0.0, 100000.0f     , 4, 3, 2, 1);
    __m256 M = _mm256_min_ps(A, B);
    float[8] correct =       [1.0, 1.0, -9.0, 100000.0f     , 1, 2, 2, 1];
    assert(M.array == correct);
}

// TODO __m256d _mm256_movedup_pd (__m256d a)
// TODO __m256 _mm256_movehdup_ps (__m256 a)
// TODO __m256 _mm256_moveldup_ps (__m256 a)
// TODO int _mm256_movemask_pd (__m256d a)
// TODO int _mm256_movemask_ps (__m256 a)

/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_mul_pd (__m256d a, __m256d b) pure @safe
{
    return a * b;
}
unittest
{
    __m256d a = [-2.0, 1.5, -2.0, 1.5];
    a = _mm256_mul_pd(a, a);
    assert(a.array == [4.0, 2.25, 4.0, 2.25]);
}

/// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_mul_ps (__m256 a, __m256 b) pure @safe
{
    return a * b;
}
unittest
{
    __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm256_mul_ps(a, a);
    float[8] correct = [2.25f, 4.0f, 9.0f, 1.0f, 2.25f, 4.0f, 9.0f, 1.0f];
    assert(a.array == correct);
}

/// Compute the bitwise NOT of 256 bits in `a`. #BONUS
__m256i _mm256_not_si256 (__m256i a) pure @safe
{
    return ~a;
}
unittest
{
    __m256i A = _mm256_set1_epi64x(-748);
    long4 notA = cast(long4) _mm256_not_si256(A);
    int[4] correct = [747, 747, 747, 747];
    assert(notA.array == correct);
}

/// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_or_pd (__m256d a, __m256d b) pure @safe
{
    return cast(__m256d)( cast(__m256i)a | cast(__m256i)b );
}

/// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_or_ps (__m256 a, __m256 b) pure @safe
{
    return cast(__m256)( cast(__m256i)a | cast(__m256i)b );
}
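
// Added usage sketch (not from the original test suite): OR-ing in the bit
// pattern of a negative zero sets the sign bit of every element, i.e. it
// computes -abs(x) for finite inputs.
unittest
{
    __m256d A = _mm256_setr_pd(1.0, -2.0, 3.0, -4.0);
    __m256d R = _mm256_or_pd(A, _mm256_set1_pd(-0.0));
    double[4] correct = [-1.0, -2.0, -3.0, -4.0];
    assert(R.array == correct);

    __m256 B = _mm256_setr_ps(1.0f, -2, 3, -4, 5, -6, 7, -8);
    __m256 R2 = _mm256_or_ps(B, _mm256_set1_ps(-0.0f));
    float[8] correct2 = [-1.0f, -2, -3, -4, -5, -6, -7, -8];
    assert(R2.array == correct2);
}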

// TODO __m128d _mm_permute_pd (__m128d a, int imm8)
// TODO __m256d _mm256_permute_pd (__m256d a, int imm8)
// TODO __m128 _mm_permute_ps (__m128 a, int imm8)
// TODO __m256 _mm256_permute_ps (__m256 a, int imm8)
// TODO __m256d _mm256_permute2f128_pd (__m256d a, __m256d b, int imm8)
// TODO __m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
// TODO __m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)
// TODO __m128d _mm_permutevar_pd (__m128d a, __m128i b)
// TODO __m256d _mm256_permutevar_pd (__m256d a, __m256i b)
// TODO __m128 _mm_permutevar_ps (__m128 a, __m128i b)
// TODO __m256 _mm256_permutevar_ps (__m256 a, __m256i b)

// TODO __m256 _mm256_rcp_ps (__m256 a)

// TODO __m256d _mm256_round_pd (__m256d a, int rounding)
// TODO __m256 _mm256_round_ps (__m256 a, int rounding)

// TODO __m256 _mm256_rsqrt_ps (__m256 a)

/// Set packed 16-bit integers with the supplied values.
__m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8,
                          short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short16 r; // Note: = void would prevent GDC from inlining a constant short16...
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    r.ptr[4] = e4;
    r.ptr[5] = e5;
    r.ptr[6] = e6;
    r.ptr[7] = e7;
    r.ptr[8] = e8;
    r.ptr[9] = e9;
    r.ptr[10] = e10;
    r.ptr[11] = e11;
    r.ptr[12] = e12;
    r.ptr[13] = e13;
    r.ptr[14] = e14;
    r.ptr[15] = e15;
    return cast(__m256i) r;
}
unittest
{
    short16 A = cast(short16) _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8,
                                               7, 6, 5, 4, 3, 2, 1, 0);
    foreach(i; 0..16)
        assert(A.array[i] == i);
}

/// Set packed 32-bit integers with the supplied values.
__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
{
    // Inlines a constant with GCC -O1, LDC -O2
    int8 r; // = void would prevent GCC from inlining a constant call
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    r.ptr[4] = e4;
    r.ptr[5] = e5;
    r.ptr[6] = e6;
    r.ptr[7] = e7;
    return cast(__m256i)r;
}
unittest
{
    int8 A = cast(int8) _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    foreach(i; 0..8)
        assert(A.array[i] == i);
}

/// Set packed 64-bit integers with the supplied values.
__m256i _mm256_set_epi64x (long e3, long e2, long e1, long e0) pure @trusted
{
    long4 r = void;
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    return r;
}
unittest
{
    __m256i A = _mm256_set_epi64x(-1, 42, long.min, long.max);
    long[4] correct = [long.max, long.min, 42, -1];
    assert(A.array == correct);
}

/// Set packed 8-bit integers with the supplied values.
__m256i _mm256_set_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
                         byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
                         byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9, byte e8,
                         byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0)
{
    // Inline a constant call in GDC -O1 and LDC -O2
    align(32) byte[32] result = [ e0, e1, e2, e3, e4, e5, e6, e7,
                                  e8, e9, e10, e11, e12, e13, e14, e15,
                                  e16, e17, e18, e19, e20, e21, e22, e23,
                                  e24, e25, e26, e27, e28, e29, e30, e31 ];
    return *cast(__m256i*)(result.ptr);
}
unittest
{
    byte32 R = cast(byte32) _mm256_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7);
    byte[32] correct = [7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0, 3, 2, 1, 0,
                        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
    assert(R.array == correct);
}

/// Set packed `__m256` vector with the supplied values.
__m256 _mm256_set_m128 (__m128 hi, __m128 lo) pure @trusted
{
    // DMD PERF
    static if (GDC_with_AVX)
    {
        __m256 r = __builtin_ia32_ps256_ps(lo);
        return __builtin_ia32_vinsertf128_ps256(r, hi, 1);
    }
    else version(DigitalMars)
    {
        __m256 r = void;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return r;
    }
    else
    {
        // TODO: BUG, doesn't work if AVX vector is emulated, but SSE vector is not
        // PERF: this crashes on DMD v100.2 on Linux x86_64, find out why since
        //       it would be better performance-wise
        // Note: probably because emulated AVX vectors have no alignment requisites!
        __m256 r = void;
        __m128* p = cast(__m128*)(&r);
        p[0] = lo;
        p[1] = hi;
        return r;
    }
}
unittest
{
    __m128 lo = _mm_setr_ps(1.0f, 2, 3, 4);
    __m128 hi = _mm_setr_ps(3.0f, 4, 5, 6);
    __m256 R = _mm256_set_m128(hi, lo);
    float[8] correct = [1.0f, 2, 3, 4, 3, 4, 5, 6];
    assert(R.array == correct);
}

/// Set packed `__m256d` vector with the supplied values.
__m256d _mm256_set_m128d (__m128d hi, __m128d lo) pure @trusted
{
    __m256d r = void;
    r.ptr[0] = lo.array[0];
    r.ptr[1] = lo.array[1];
    r.ptr[2] = hi.array[0];
    r.ptr[3] = hi.array[1];
    return r;
}
unittest
{
    __m128d lo = _mm_setr_pd(1.0, 2.0);
    __m128d hi = _mm_setr_pd(3.0, 4.0);
    __m256d R = _mm256_set_m128d(hi, lo);
    double[4] correct = [1.0, 2.0, 3.0, 4.0];
    assert(R.array == correct);
}

/// Set packed `__m256i` vector with the supplied values.
__m256i _mm256_set_m128i (__m128i hi, __m128i lo) pure @trusted
{
    // DMD PERF
    static if (GDC_with_AVX)
    {
        __m256i r = cast(long4) __builtin_ia32_si256_si (lo);
        return cast(long4) __builtin_ia32_vinsertf128_si256(cast(int8)r, hi, 1);
    }
    else version(DigitalMars)
    {
        int8 r = void;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return cast(long4)r;
    }
    else
    {
        // PERF: does this also crash for DMD? With DMD v100.2 on Linux x86_64
        __m256i r = void;
        __m128i* p = cast(__m128i*)(&r);
        p[0] = lo;
        p[1] = hi;
        return r;
    }
}
unittest
{
    __m128i lo = _mm_setr_epi32( 1, 2, 3, 4);
    __m128i hi = _mm_set_epi32(-3, -4, -5, -6);
    int8 R = cast(int8)_mm256_set_m128i(hi, lo);
    int[8] correct = [1, 2, 3, 4, -6, -5, -4, -3];
    assert(R.array == correct);
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied values.
__m256d _mm256_set_pd (double e3, double e2, double e1, double e0) pure @trusted
{
    __m256d r = void;
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    return r;
}
unittest
{
    __m256d A = _mm256_set_pd(3, 2, 1, 546);
    double[4] correct = [546.0, 1.0, 2.0, 3.0];
    assert(A.array == correct);
}

/// Set packed single-precision (32-bit) floating-point elements with the supplied values.

/// Set packed single-precision (32-bit) floating-point elements with the supplied values.
__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted
{
    // PERF: see #102, use = void?
    __m256 r;
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    r.ptr[4] = e4;
    r.ptr[5] = e5;
    r.ptr[6] = e6;
    r.ptr[7] = e7;
    return r;
}
unittest
{
    __m256 A = _mm256_set_ps(3, 2, 1, 546.0f, -1.25f, -2, -3, 0);
    float[8] correct = [0, -3, -2, -1.25f, 546.0f, 1.0, 2.0, 3.0];
    assert(A.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi16 (short a) pure @trusted
{
    // Workaround for https://issues.dlang.org/show_bug.cgi?id=21469
    // It used to ICE; now the codegen is just wrong.
    // TODO: report this backend issue.
    version(DigitalMars)
    {
        short16 v = a;
        return cast(__m256i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(short16(a));
    }
}
unittest
{
    short16 a = cast(short16) _mm256_set1_epi16(31);
    for (int i = 0; i < 16; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 32-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi32 (int a) pure @trusted
{
    // Otherwise bad codegen in DMD.
    // TODO: report this backend issue.
    version(DigitalMars)
    {
        int8 v = a;
        return cast(__m256i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(int8(a));
    }
}
unittest
{
    int8 a = cast(int8) _mm256_set1_epi32(31);
    for (int i = 0; i < 8; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 64-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi64x (long a) pure @trusted
{
    return cast(__m256i)(long4(a));
}
unittest
{
    long4 a = cast(long4) _mm256_set1_epi64x(-31);
    for (int i = 0; i < 4; ++i)
        assert(a.array[i] == -31);
}

/// Broadcast 8-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi8 (byte a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        byte32 v = a;
        return cast(__m256i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(byte32(a));
    }
}
unittest
{
    byte32 a = cast(byte32) _mm256_set1_epi8(31);
    for (int i = 0; i < 32; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast double-precision (64-bit) floating-point value `a` to all elements of the return value.
__m256d _mm256_set1_pd (double a) pure @trusted
{
    return __m256d(a);
}
unittest
{
    double a = 464.21;
    double[4] correct = [a, a, a, a];
    double4 A = cast(double4) _mm256_set1_pd(a);
    assert(A.array == correct);
}

/// Broadcast single-precision (32-bit) floating-point value `a` to all elements of the return value.
__m256 _mm256_set1_ps (float a) pure @trusted
{
    return __m256(a);
}
unittest
{
    float a = 464.21f;
    float[8] correct = [a, a, a, a, a, a, a, a];
    float8 A = cast(float8) _mm256_set1_ps(a);
    assert(A.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8,
                           short e7,  short e6,  short e5,  short e4,  short e3,  short e2,  short e1, short e0) pure @trusted
{
    short[16] result = [ e15, e14, e13, e12, e11, e10, e9, e8,
                         e7,  e6,  e5,  e4,  e3,  e2,  e1, e0];
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) result.ptr);
    }
    else version(LDC)
    {
        return cast(__m256i)( loadUnaligned!(short16)(result.ptr) );
    }
    else
    {
        short16 r;
        for (int n = 0; n < 16; ++n)
            r.ptr[n] = result[n];
        return cast(__m256i)r;
    }
}
unittest
{
    short16 A = cast(short16) _mm256_setr_epi16(-1, 0, -21, 21, 42, 127, -42, -128,
                                                -1, 0, -21, 21, 42, 127, -42, -128);
    short[16] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128];
    assert(A.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
{
    // Inlines a constant with GDC -O1, LDC -O2
    int8 r; // = void would prevent GDC from inlining a constant call
    r.ptr[0] = e7;
    r.ptr[1] = e6;
    r.ptr[2] = e5;
    r.ptr[3] = e4;
    r.ptr[4] = e3;
    r.ptr[5] = e2;
    r.ptr[6] = e1;
    r.ptr[7] = e0;
    return cast(__m256i)r;
}
unittest
{
    int8 A = cast(int8) _mm256_setr_epi32(-1, 0, -2147483648, 2147483647, 42, 666, -42, -666);
    int[8] correct = [-1, 0, -2147483648, 2147483647, 42, 666, -42, -666];
    assert(A.array == correct);
}

/// Set packed 64-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi64x (long e3, long e2, long e1, long e0) pure @trusted
{
    long4 r = void;
    r.ptr[0] = e3;
    r.ptr[1] = e2;
    r.ptr[2] = e1;
    r.ptr[3] = e0;
    return r;
}
unittest
{
    __m256i A = _mm256_setr_epi64x(-1, 42, long.min, long.max);
    long[4] correct = [-1, 42, long.min, long.max];
    assert(A.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
                          byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
                          byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9,  byte e8,
                          byte e7,  byte e6,  byte e5,  byte e4,  byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    // Inline a constant call in GDC -O1 and LDC -O2
    align(32) byte[32] result = [ e31, e30, e29, e28, e27, e26, e25, e24,
                                  e23, e22, e21, e20, e19, e18, e17, e16,
                                  e15, e14, e13, e12, e11, e10, e9,  e8,
                                  e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0];
    return *cast(__m256i*)(result.ptr);
}
unittest
{
    byte32 A = cast(byte32) _mm256_setr_epi8( -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128);
    byte[32] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128];
    assert(A.array == correct);
}
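
// Illustrative test, not part of the original source: the `set` functions take
// arguments from the highest element down to the lowest, while the `setr`
// functions take them in memory ("reversed") order, so these two calls agree.
unittest
{
    int8 A = cast(int8) _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
    int8 B = cast(int8) _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    assert(A.array == B.array);
}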

/// Set packed `__m256` vector with the supplied values.
__m256 _mm256_setr_m128 (__m128 lo, __m128 hi) pure @safe
{
    return _mm256_set_m128(hi, lo);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
    __m128 B = _mm_setr_ps(3.0f, 4, 5, 6);
    __m256 R = _mm256_setr_m128(B, A);
    float[8] correct = [3.0f, 4, 5, 6, 1, 2, 3, 4];
    assert(R.array == correct);
}

/// Set packed `__m256d` vector with the supplied values.
__m256d _mm256_setr_m128d (__m128d lo, __m128d hi) pure @safe
{
    return _mm256_set_m128d(hi, lo);
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    __m256d R = _mm256_setr_m128d(B, A);
    double[4] correct = [3.0, 4.0, 1.0, 2.0];
    assert(R.array == correct);
}

/// Set packed `__m256i` vector with the supplied values.
__m256i _mm256_setr_m128i (__m128i lo, __m128i hi) pure @safe
{
    return _mm256_set_m128i(hi, lo);
}
unittest
{
    __m128i A = _mm_setr_epi32( 1, 2, 3, 4);
    __m128i B =  _mm_set_epi32(-3, -4, -5, -6);
    int8 R = cast(int8)_mm256_setr_m128i(B, A);
    int[8] correct = [-6, -5, -4, -3, 1, 2, 3, 4];
    assert(R.array == correct);
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
__m256d _mm256_setr_pd (double e3, double e2, double e1, double e0) pure @trusted
{
    version(LDC)
    {
        // PERF: probably not the best.
        double[4] result = [e3, e2, e1, e0];
        return loadUnaligned!(double4)(result.ptr);
    }
    else
    {
        __m256d r;
        r.ptr[0] = e3;
        r.ptr[1] = e2;
        r.ptr[2] = e1;
        r.ptr[3] = e0;
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(3, 2, 1, 546.125);
    double[4] correct = [3.0, 2.0, 1.0, 546.125];
    assert(A.array == correct);
}

/// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order.
__m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        align(32) float[8] r = [ e7, e6, e5, e4, e3, e2, e1, e0];
        return *cast(__m256*)r;
    }
    else version(LDC)
    {
        align(32) float[8] r = [ e7, e6, e5, e4, e3, e2, e1, e0];
        return *cast(__m256*)r;
    }
    else
    {
        __m256 r;
        r.ptr[0] = e7;
        r.ptr[1] = e6;
        r.ptr[2] = e5;
        r.ptr[3] = e4;
        r.ptr[4] = e3;
        r.ptr[5] = e2;
        r.ptr[6] = e1;
        r.ptr[7] = e0;
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps( 3, 2, 1, 546.125f, 4, 5, 6, 7);
    float[8] correct = [3.0f, 2, 1, 546.125f, 4, 5, 6, 7];
    assert(A.array == correct);
}

/// Return vector of type `__m256d` with all elements set to zero.
__m256d _mm256_setzero_pd() pure @safe
{
    return double4(0.0);
}
unittest
{
    __m256d A = _mm256_setzero_pd();
    double[4] correct = [0.0, 0.0, 0.0, 0.0];
    assert(A.array == correct);
}

/// Return vector of type `__m256` with all elements set to zero.
__m256 _mm256_setzero_ps() pure @safe
{
    return float8(0.0f);
}
unittest
{
    __m256 A = _mm256_setzero_ps();
    float[8] correct = [0.0f, 0, 0, 0, 0, 0, 0, 0];
    assert(A.array == correct);
}

/// Return vector of type `__m256i` with all elements set to zero.
__m256i _mm256_setzero_si256() pure @trusted
{
    return __m256i(0);
}
unittest
{
    __m256i A = _mm256_setzero_si256();
    long[4] correct = [0, 0, 0, 0];
    assert(A.array == correct);
}

/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the
/// control in `imm8`.
__m256d _mm256_shuffle_pd(int imm8)(__m256d a, __m256d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_shufpd256(a, b, imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(double4,
                                 (imm8 >> 0) & 1,
                                 4 + ( (imm8 >> 1) & 1),
                                 2 + ( (imm8 >> 2) & 1),
                                 6 + ( (imm8 >> 3) & 1) )(a, b);
    }
    else
    {
        double4 r = void;
        r.ptr[0] = a.array[(imm8 >> 0) & 1];
        r.ptr[1] = b.array[(imm8 >> 1) & 1];
        r.ptr[2] = a.array[2 + ( (imm8 >> 2) & 1)];
        r.ptr[3] = b.array[2 + ( (imm8 >> 3) & 1)];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd( 0, 1, 2, 3);
    __m256d B = _mm256_setr_pd( 4, 5, 6, 7);
    __m256d C = _mm256_shuffle_pd!75 /* 01001011 */(A, B);
    double[4] correct = [1.0, 5.0, 2.0, 7.0];
    assert(C.array == correct);
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using
/// the control in `imm8`.
__m256 _mm256_shuffle_ps(int imm8)(__m256 a, __m256 b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_shufps256(a, b, imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(float8, (imm8 >> 0) & 3,
                                         (imm8 >> 2) & 3,
                                         8 + ( (imm8 >> 4) & 3),
                                         8 + ( (imm8 >> 6) & 3),
                                         4 + ( (imm8 >> 0) & 3),
                                         4 + ( (imm8 >> 2) & 3),
                                         12 + ( (imm8 >> 4) & 3),
                                         12 + ( (imm8 >> 6) & 3) )(a, b);
    }
    else
    {
        float8 r = void;
        r.ptr[0] = a.array[(imm8 >> 0) & 3];
        r.ptr[1] = a.array[(imm8 >> 2) & 3];
        r.ptr[2] = b.array[(imm8 >> 4) & 3];
        r.ptr[3] = b.array[(imm8 >> 6) & 3];
        r.ptr[4] = a.array[4 + ( (imm8 >> 0) & 3 )];
        r.ptr[5] = a.array[4 + ( (imm8 >> 2) & 3 )];
        r.ptr[6] = b.array[4 + ( (imm8 >> 4) & 3 )];
        r.ptr[7] = b.array[4 + ( (imm8 >> 6) & 3 )];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps( 0,  1,  2,  3,  4,  5,  6,  7);
    __m256 B = _mm256_setr_ps( 8,  9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_shuffle_ps!75 /* 01001011 */(A, B);
    float[8] correct = [3.0f, 2, 8, 9, 7, 6, 12, 13];
    assert(C.array == correct);
}

/// Compute the square root of packed double-precision (64-bit) floating-point elements in `a`.
__m256d _mm256_sqrt_pd (__m256d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_sqrtpd256(a);
    }
    else version(LDC)
    {
        return llvm_sqrt(a);
    }
    else
    {
        a.ptr[0] = sqrt(a.array[0]);
        a.ptr[1] = sqrt(a.array[1]);
        a.ptr[2] = sqrt(a.array[2]);
        a.ptr[3] = sqrt(a.array[3]);
        return a;
    }
}
unittest
{
    __m256d A = _mm256_sqrt_pd(_mm256_set1_pd(4.0));
    double[4] correct = [2.0, 2, 2, 2];
    assert(A.array == correct);
}

/// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
__m256 _mm256_sqrt_ps (__m256 a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_sqrtps256(a);
    }
    else version(LDC)
    {
        return llvm_sqrt(a);
    }
    else
    {
        a.ptr[0] = sqrt(a.array[0]);
        a.ptr[1] = sqrt(a.array[1]);
        a.ptr[2] = sqrt(a.array[2]);
        a.ptr[3] = sqrt(a.array[3]);
        a.ptr[4] = sqrt(a.array[4]);
        a.ptr[5] = sqrt(a.array[5]);
        a.ptr[6] = sqrt(a.array[6]);
        a.ptr[7] = sqrt(a.array[7]);
        return a;
    }
}
unittest
{
    __m256 A = _mm256_sqrt_ps(_mm256_set1_ps(4.0f));
    float[8] correct = [2.0f, 2, 2, 2, 2, 2, 2, 2];
    assert(A.array == correct);
}

/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
/// exception may be generated.
void _mm256_store_pd (double* mem_addr, __m256d a) pure @system
{
    *cast(__m256d*)mem_addr = a;
}
unittest
{
    align(32) double[4] mem;
    double[4] correct = [1.0, 2, 3, 4];
    _mm256_store_pd(mem.ptr, _mm256_setr_pd(1.0, 2, 3, 4));
    assert(mem == correct);
}

/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
/// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
/// exception may be generated.
void _mm256_store_ps (float* mem_addr, __m256 a) pure @system
{
    *cast(__m256*)mem_addr = a;
}
unittest
{
    align(32) float[8] mem;
    float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8];
    _mm256_store_ps(mem.ptr, _mm256_set_ps(8.0, 7, 6, 5, 4, 3, 2, 1));
    assert(mem == correct);
}

/// Store 256-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 32-byte
/// boundary or a general-protection exception may be generated.
void _mm256_store_si256 (__m256i* mem_addr, __m256i a) pure @safe
{
    *mem_addr = a;
}
unittest
{
    align(32) long[4] mem;
    long[4] correct = [5, -6, -7, 8];
    _mm256_store_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
    assert(mem == correct);
}

/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm256_storeu_pd (double* mem_addr, __m256d a) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        __builtin_ia32_storeupd256(mem_addr, a);
    }
    else version(LDC)
    {
        storeUnaligned!__m256d(a, mem_addr);
    }
    else
    {
        for (int n = 0; n < 4; ++n)
            mem_addr[n] = a.array[n];
    }
}
unittest
{
    align(32) double[6] arr = [0.0, 0, 0, 0, 0, 0];
    _mm256_storeu_pd(&arr[1], _mm256_set1_pd(4.0));
    double[4] correct = [4.0, 4, 4, 4];
    assert(arr[1..5] == correct);
}

/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm256_storeu_ps (float* mem_addr, __m256 a) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        __builtin_ia32_storeups256(mem_addr, a);
    }
    else version(LDC)
    {
        storeUnaligned!__m256(a, mem_addr);
    }
    else
    {
        for (int n = 0; n < 8; ++n)
            mem_addr[n] = a.array[n];
    }
}
unittest
{
    align(32) float[10] arr = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    _mm256_storeu_ps(&arr[1], _mm256_set1_ps(4.0f));
    float[8] correct = [4.0f, 4, 4, 4, 4, 4, 4, 4];
    assert(arr[1..9] == correct);
}

/// Store 256-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned
/// on any particular boundary.
void _mm256_storeu_si256 (__m256i* mem_addr, __m256i a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        __builtin_ia32_storedqu256(cast(char*)mem_addr, cast(ubyte32) a);
    }
    else version(LDC)
    {
        storeUnaligned!__m256i(a, cast(long*)mem_addr);
    }
    else
    {
        long4 v = cast(long4)a;
        long* p = cast(long*)mem_addr;
        for (int n = 0; n < 4; ++n)
            p[n] = v[n];
    }
}
unittest
{
    align(32) long[6] arr = [0, 0, 0, 0, 0, 0];
    _mm256_storeu_si256( cast(__m256i*) &arr[1], _mm256_set1_epi64x(4));
    long[4] correct = [4, 4, 4, 4];
    assert(arr[1..5] == correct);
}

/// Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a` into memory at two different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a) pure @system
{
    // Note: this performs much better on GDC, and similarly on LDC, than using other intrinsics.
    loaddr[0] = a.array[0];
    loaddr[1] = a.array[1];
    loaddr[2] = a.array[2];
    loaddr[3] = a.array[3];
    hiaddr[0] = a.array[4];
    hiaddr[1] = a.array[5];
    hiaddr[2] = a.array[6];
    hiaddr[3] = a.array[7];
}
unittest
{
    align(32) float[11] A = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    _mm256_storeu2_m128(&A[1], &A[6], _mm256_set1_ps(2.0f));
    float[11] correct = [0.0f, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0];
    assert(A == correct);
}

/// Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory at two different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a) pure @system
{
    loaddr[0] = a.array[0];
    loaddr[1] = a.array[1];
    hiaddr[0] = a.array[2];
    hiaddr[1] = a.array[3];
}
unittest
{
    double[2] A;
    double[2] B;
    _mm256_storeu2_m128d(A.ptr, B.ptr, _mm256_set1_pd(-43.0));
    double[2] correct = [-43.0, -43];
    assert(A == correct);
    assert(B == correct);
}

/// Store the high and low 128-bit halves (each composed of integer data) from `a` into memory at
/// two different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a) pure @trusted // TODO: signature
{
    long* hi = cast(long*)hiaddr;
    long* lo = cast(long*)loaddr;
    lo[0] = a.array[0];
    lo[1] = a.array[1];
    hi[0] = a.array[2];
    hi[1] = a.array[3];
}
unittest
{
    long[2] A;
    long[2] B;
    _mm256_storeu2_m128i(cast(__m128i*)A.ptr, cast(__m128i*)B.ptr, _mm256_set1_epi64x(-42));
    long[2] correct = [-42, -42];
    assert(A == correct);
    assert(B == correct);
}

/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_pd (double* mem_addr, __m256d a) pure @system
{
    // PERF DMD
    // PERF GDC + SSE2
    version(LDC)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x double> %1, <4 x double>* %0, align 32, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, double4*, double4)(cast(double4*)mem_addr, a);
    }
    else static if (GDC_with_AVX)
    {
        __builtin_ia32_movntpd256(mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m256d* dest = cast(__m256d*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(32) double[4] mem;
    double[4] correct = [5.0, -6, -7, 8];
    _mm256_stream_pd(mem.ptr, _mm256_setr_pd(5.0, -6, -7, 8));
    assert(mem == correct);
}

/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
/// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_ps (float* mem_addr, __m256 a) pure @system
{
    // PERF DMD
    // PERF GDC + SSE2
    version(LDC)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <8 x float> %1, <8 x float>* %0, align 32, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, float8*, float8)(cast(float8*)mem_addr, a);
    }
    else static if (GDC_with_AVX)
    {
        __builtin_ia32_movntps256(mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m256* dest = cast(__m256*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(32) float[8] mem;
    float[8] correct = [5, -6, -7, 8, 1, 2, 3, 4];
    _mm256_stream_ps(mem.ptr, _mm256_setr_ps(5, -6, -7, 8, 1, 2, 3, 4));
    assert(mem == correct);
}

/// Store 256-bits of integer data from `a` into memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be
/// generated.
/// Note: there isn't any particular instruction in AVX to do that; it just defers to SSE2.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_si256 (__m256i* mem_addr, __m256i a) pure @trusted
{
    // PERF DMD
    // PERF GDC
    version(LDC)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x i64> %1, <4 x i64>* %0, align 16, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, long4*, long4)(mem_addr, a);
    }
    else static if (GDC_with_SSE2) // any hope to be non-temporal? Using SSE2 instructions.
    {
        long2 lo, hi;
        lo.ptr[0] = a.array[0];
        lo.ptr[1] = a.array[1];
        hi.ptr[0] = a.array[2];
        hi.ptr[1] = a.array[3];
        _mm_stream_si128(cast(__m128i*)mem_addr, cast(__m128i)lo);
        _mm_stream_si128((cast(__m128i*)mem_addr) + 1, cast(__m128i)hi);
    }
    else
    {
        // Regular store instead.
        __m256i* dest = cast(__m256i*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(32) long[4] mem;
    long[4] correct = [5, -6, -7, 8];
    _mm256_stream_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
    assert(mem == correct);
}
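
// Illustrative test, not part of the original source: the store-then-fence
// pattern that the `_mm_sfence()` notes above recommend for non-temporal
// stores (`_mm_sfence` is available through the earlier instruction sets
// publicly imported by this module).
unittest
{
    align(32) long[4] buf;
    _mm256_stream_si256(cast(__m256i*) buf.ptr, _mm256_set1_epi64x(7));
    _mm_sfence(); // publish the streamed stores before signalling reader threads
    long[4] correct = [7, 7, 7, 7];
    assert(buf == correct);
}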

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from
/// packed double-precision (64-bit) floating-point elements in `a`.
__m256d _mm256_sub_pd (__m256d a, __m256d b) pure @safe
{
    return a - b;
}
unittest
{
    __m256d a = [1.5, -2.0, 3.0, 200000.0];
    a = _mm256_sub_pd(a, a);
    double[4] correct = [0.0, 0, 0, 0];
    assert(a.array == correct);
}

/// Subtract packed single-precision (32-bit) floating-point elements in `b` from
/// packed single-precision (32-bit) floating-point elements in `a`.
__m256 _mm256_sub_ps (__m256 a, __m256 b) pure @safe
{
    return a - b;
}
unittest
{
    __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2000.0f, 3.0f, 1.0f];
    a = _mm256_sub_ps(a, a);
    float[8] correct = [0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == correct);
}

// TODO int _mm_testc_pd (__m128d a, __m128d b)
// TODO int _mm256_testc_pd (__m256d a, __m256d b)
// TODO int _mm_testc_ps (__m128 a, __m128 b)
// TODO int _mm256_testc_ps (__m256 a, __m256 b)
// TODO int _mm256_testc_si256 (__m256i a, __m256i b)
// TODO int _mm_testnzc_pd (__m128d a, __m128d b)
// TODO int _mm256_testnzc_pd (__m256d a, __m256d b)
// TODO int _mm_testnzc_ps (__m128 a, __m128 b)
// TODO int _mm256_testnzc_ps (__m256 a, __m256 b)
// TODO int _mm256_testnzc_si256 (__m256i a, __m256i b)
// TODO int _mm_testz_pd (__m128d a, __m128d b)
// TODO int _mm256_testz_pd (__m256d a, __m256d b)
// TODO int _mm_testz_ps (__m128 a, __m128 b)
// TODO int _mm256_testz_ps (__m256 a, __m256 b)
// TODO int _mm256_testz_si256 (__m256i a, __m256i b)

/// Return vector of type `__m256d` with undefined elements.
__m256d _mm256_undefined_pd () pure @safe
{
    __m256d r = void;
    return r;
}

/// Return vector of type `__m256` with undefined elements.
__m256 _mm256_undefined_ps () pure @safe
{
    __m256 r = void;
    return r;
}

/// Return vector of type `__m256i` with undefined elements.
__m256i _mm256_undefined_si256 () pure @safe
{
    __m256i r = void;
    return r;
}
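
// Illustrative test, not part of the original source: vectors returned by the
// `_mm256_undefined_*` functions above hold unspecified values, so the only
// portable use is as a destination that is fully overwritten before any read.
unittest
{
    __m256i r = _mm256_undefined_si256(); // contents are unspecified here
    r = _mm256_set1_epi32(42);            // fully overwritten before any read
    int8 v = cast(int8) r;
    foreach(i; 0..8)
        assert(v.array[i] == 42);
}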

/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of
/// each 128-bit lane in `a` and `b`.
__m256d _mm256_unpackhi_pd (__m256d a, __m256d b) pure @trusted
{
    version(LDC)
    {
        return shufflevectorLDC!(double4, 1, 5, 3, 7)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpckhpd256(a, b);
    }
    else
    {
        __m256d r;
        r.ptr[0] = a.array[1];
        r.ptr[1] = b.array[1];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    __m256d B = _mm256_setr_pd(5.0, 6, 7, 8);
    __m256d C = _mm256_unpackhi_pd(A, B);
    double[4] correct = [2.0, 6, 4, 8];
    assert(C.array == correct);
}

/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of
/// each 128-bit lane in `a` and `b`.
__m256 _mm256_unpackhi_ps (__m256 a, __m256 b) pure @trusted
{
    version(LDC)
    {
        return shufflevectorLDC!(float8, 2, 10, 3, 11, 6, 14, 7, 15)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpckhps256(a, b);
    }
    else
    {
        __m256 r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = b.array[2];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        r.ptr[4] = a.array[6];
        r.ptr[5] = b.array[6];
        r.ptr[6] = a.array[7];
        r.ptr[7] = b.array[7];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    __m256 B = _mm256_setr_ps(8.0f, 9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_unpackhi_ps(A, B);
    float[8] correct = [2.0f, 10, 3, 11, 6, 14, 7, 15];
    assert(C.array == correct);
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of
/// each 128-bit lane in `a` and `b`.
__m256d _mm256_unpacklo_pd (__m256d a, __m256d b) pure @trusted
{
    version(LDC)
    {
        return shufflevectorLDC!(double4, 0, 4, 2, 6)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpcklpd256(a, b);
    }
    else
    {
        __m256d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[2];
        r.ptr[3] = b.array[2];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    __m256d B = _mm256_setr_pd(5.0, 6, 7, 8);
    __m256d C = _mm256_unpacklo_pd(A, B);
    double[4] correct = [1.0, 5, 3, 7];
    assert(C.array == correct);
}

/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of
/// each 128-bit lane in `a` and `b`.
__m256 _mm256_unpacklo_ps (__m256 a, __m256 b) pure @trusted
{
    version(LDC)
    {
        return shufflevectorLDC!(float8, 0, 8, 1, 9, 4, 12, 5, 13)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpcklps256(a, b);
    }
    else
    {
        __m256 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        r.ptr[4] = a.array[4];
        r.ptr[5] = b.array[4];
        r.ptr[6] = a.array[5];
        r.ptr[7] = b.array[5];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    __m256 B = _mm256_setr_ps(8.0f, 9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_unpacklo_ps(A, B);
    float[8] correct = [0.0f, 8, 1, 9, 4, 12, 5, 13];
    assert(C.array == correct);
}

/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_xor_pd (__m256d a, __m256d b) pure @safe
{
    return cast(__m256d)( cast(__m256i)a ^ cast(__m256i)b );
}

/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_xor_ps (__m256 a, __m256 b) pure @safe
{
    return cast(__m256)( cast(__m256i)a ^ cast(__m256i)b );
}
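
// The two XOR intrinsics above ship without unittests; these illustrative
// tests (not part of the original source) rely on the bitwise identities
// x ^ x == 0 and x ^ 0 == x.
unittest
{
    __m256d A = _mm256_set1_pd(-4.25);
    long4 R = cast(long4) _mm256_xor_pd(A, A); // x ^ x gives all-zero bits
    long[4] correct = [0, 0, 0, 0];
    assert(R.array == correct);
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    __m256 R = _mm256_xor_ps(A, _mm256_setzero_ps()); // x ^ 0 keeps the bits
    float[8] correct = [1.0f, 2, 3, 4, 5, 6, 7, 8];
    assert(R.array == correct);
}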

/// Zero the contents of all XMM or YMM registers.
void _mm256_zeroall () pure @safe
{
    // PERF: DMD needs to do it explicitly if AVX is ever used.
    static if (GDC_with_AVX)
    {
        __builtin_ia32_vzeroall();
    }
    else
    {
        // Do nothing. The transition penalties are supposed to be handled by the backend.
    }
}

/// Zero the upper 128 bits of all YMM registers; the lower 128 bits of the registers
/// are unmodified.
void _mm256_zeroupper () pure @safe
{
    // PERF: DMD needs to do it explicitly if AVX is ever used.
    static if (GDC_with_AVX)
    {
        __builtin_ia32_vzeroupper();
    }
    else
    {
        // Do nothing. The transition penalties are supposed to be handled by the backend.
    }
}

/// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are zeroed.
__m256d _mm256_zextpd128_pd256 (__m128d a) pure @trusted
{
    __m256d r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[1];
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return r;
}
unittest
{
    __m256d R = _mm256_zextpd128_pd256(_mm_setr_pd(2.0, -3.0));
    double[4] correct = [2.0, -3, 0, 0];
    assert(R.array == correct);
}

/// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are zeroed.
__m256 _mm256_zextps128_ps256 (__m128 a) pure @trusted
{
    double2 la = cast(double2)a;
    double4 r;
    r.ptr[0] = la.array[0];
    r.ptr[1] = la.array[1];
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return cast(__m256)r;
}
unittest
{
    __m256 R = _mm256_zextps128_ps256(_mm_setr_ps(2.0, -3.0, 4, -5));
    float[8] correct = [2.0, -3, 4, -5, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are zeroed.
__m256i _mm256_zextsi128_si256 (__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    __m256i r;
    r.ptr[0] = la.array[0];
    r.ptr[1] = la.array[1];
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return r;
}
unittest
{
    __m256i R = _mm256_zextsi128_si256(_mm_setr_epi64(-1, 99));
    long[4] correct = [-1, 99, 0, 0];
    assert(R.array == correct);
}

/+

pragma(LDC_intrinsic, "llvm.x86.avx.cvtt.pd2dq.256")
    int4 __builtin_ia32_cvttpd2dq256(double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.cvtt.ps2dq.256")
    int8 __builtin_ia32_cvttps2dq256(float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.hadd.pd.256")
    double4 __builtin_ia32_haddpd256(double4, double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.hadd.ps.256")
    float8 __builtin_ia32_haddps256(float8, float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.hsub.pd.256")
    double4 __builtin_ia32_hsubpd256(double4, double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.hsub.ps.256")
    float8 __builtin_ia32_hsubps256(float8, float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.maskload.pd")
    double2 __builtin_ia32_maskloadpd(const void*, long2);

pragma(LDC_intrinsic, "llvm.x86.avx.maskload.pd.256")
    double4 __builtin_ia32_maskloadpd256(const void*, long4);

pragma(LDC_intrinsic, "llvm.x86.avx.maskload.ps")
    float4 __builtin_ia32_maskloadps(const void*, int4);

pragma(LDC_intrinsic, "llvm.x86.avx.maskload.ps.256")
    float8 __builtin_ia32_maskloadps256(const void*, int8);

pragma(LDC_intrinsic, "llvm.x86.avx.maskstore.pd")
    void __builtin_ia32_maskstorepd(void*, long2, double2);

pragma(LDC_intrinsic, "llvm.x86.avx.maskstore.pd.256")
    void __builtin_ia32_maskstorepd256(void*, long4, double4);

pragma(LDC_intrinsic, "llvm.x86.avx.maskstore.ps")
    void __builtin_ia32_maskstoreps(void*, int4, float4);

pragma(LDC_intrinsic, "llvm.x86.avx.maskstore.ps.256")
    void __builtin_ia32_maskstoreps256(void*, int8, float8);

pragma(LDC_intrinsic, "llvm.x86.avx.movmsk.pd.256")
    int __builtin_ia32_movmskpd256(double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.movmsk.ps.256")
    int __builtin_ia32_movmskps256(float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.ptestc.256")
    int __builtin_ia32_ptestc256(long4, long4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.ptestnzc.256")
    int __builtin_ia32_ptestnzc256(long4, long4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.ptestz.256")
    int __builtin_ia32_ptestz256(long4, long4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.rcp.ps.256")
    float8 __builtin_ia32_rcpps256(float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.round.pd.256")
    double4 __builtin_ia32_roundpd256(double4, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.round.ps.256")
    float8 __builtin_ia32_roundps256(float8, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.rsqrt.ps.256")
    float8 __builtin_ia32_rsqrtps256(float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vpermilvar.pd")
    double2 __builtin_ia32_vpermilvarpd(double2, long2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vpermilvar.pd.256")
    double4 __builtin_ia32_vpermilvarpd256(double4, long4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vpermilvar.ps")
    float4 __builtin_ia32_vpermilvarps(float4, int4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vpermilvar.ps.256")
    float8 __builtin_ia32_vpermilvarps256(float8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestc.pd")
    int __builtin_ia32_vtestcpd(double2, double2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestc.pd.256")
    int __builtin_ia32_vtestcpd256(double4, double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestc.ps")
    int __builtin_ia32_vtestcps(float4, float4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestc.ps.256")
    int __builtin_ia32_vtestcps256(float8, float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestnzc.pd")
    int __builtin_ia32_vtestnzcpd(double2, double2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestnzc.pd.256")
    int __builtin_ia32_vtestnzcpd256(double4, double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestnzc.ps")
    int __builtin_ia32_vtestnzcps(float4, float4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestnzc.ps.256")
    int __builtin_ia32_vtestnzcps256(float8, float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestz.pd")
    int __builtin_ia32_vtestzpd(double2, double2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestz.pd.256")
    int __builtin_ia32_vtestzpd256(double4, double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestz.ps")
    int __builtin_ia32_vtestzps(float4, float4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestz.ps.256")
    int __builtin_ia32_vtestzps256(float8, float8) pure @safe;

+/