/**
* AVX intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
*
* Copyright: Guillaume Piolat 2022.
*            Johan Engelen 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.avxintrin;

// AVX instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX
// Note: this header will work whether you have AVX enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
// generate AVX instructions.
// With GDC, use "dflags-gdc": ["-mavx"] or equivalent to actively
// generate AVX instructions.

public import inteli.types;
import inteli.internals;

// Pull in all previous instruction set intrinsics.
public import inteli.tmmintrin;

nothrow @nogc:

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_add_pd (__m256d a, __m256d b) pure @trusted
{
    return a + b;
}
unittest
{
    align(32) double[4] A = [-1, 2, -3, 40000];
    align(32) double[4] B = [ 9, -7, 8, -0.5];
    __m256d R = _mm256_add_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
    double[4] correct = [8, -5, 5, 39999.5];
    assert(R.array == correct);
}

/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_add_ps (__m256 a, __m256 b) pure @trusted
{
    return a + b;
}
unittest
{
    align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6];
    align(32) float[8] B = [ 9.0f, -7, 8, -0.5, 8, 7, 3, -1];
    __m256 R = _mm256_add_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
    float[8] correct = [8, -5, 5, 39999.5, 8, 10, 8, 5];
    assert(R.array == correct);
}

/// Alternatively add and subtract packed double-precision (64-bit) floating-point
/// elements in `a` to/from packed elements in `b`.
__m256d _mm256_addsub_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_addsubpd256(a, b);
    }
    else static if (LDC_with_AVX)
    {
        return __builtin_ia32_addsubpd256(a, b);
    }
    else
    {
        // Note: GDC x86 generates addsubpd since GDC 11.1 with -O3
        //       LDC x86 generates addsubpd since LDC 1.18 with -O2
        //       LDC ARM: not fantastic, ok since LDC 1.18 -O2
        a.ptr[0] = a.array[0] + (-b.array[0]);
        a.ptr[1] = a.array[1] + b.array[1];
        a.ptr[2] = a.array[2] + (-b.array[2]);
        a.ptr[3] = a.array[3] + b.array[3];
        return a;
    }
}
unittest
{
    align(32) double[4] A = [-1, 2, -3, 40000];
    align(32) double[4] B = [ 9, -7, 8, -0.5];
    __m256d R = _mm256_addsub_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
    double[4] correct = [-10, -5, -11, 39999.5];
    assert(R.array == correct);
}

/// Alternatively add and subtract packed single-precision (32-bit) floating-point elements
/// in `a` to/from packed elements in `b`.
__m256 _mm256_addsub_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_addsubps256(a, b);
    }
    else static if (LDC_with_AVX)
    {
        return __builtin_ia32_addsubps256(a, b);
    }
    else
    {
        // Note: GDC x86 generates addsubps since GDC 11 -O3
        //       and in absence of AVX, a pair of SSE3 addsubps since GDC 12 -O2
        //       LDC x86 generates addsubps since LDC 1.18 -O2
        //       and in absence of AVX, a pair of SSE3 addsubps since LDC 1.1 -O1
        //       LDC ARM: neat output since LDC 1.21 -O2

        a.ptr[0] = a.array[0] + (-b.array[0]);
        a.ptr[1] = a.array[1] + b.array[1];
        a.ptr[2] = a.array[2] + (-b.array[2]);
        a.ptr[3] = a.array[3] + b.array[3];
        a.ptr[4] = a.array[4] + (-b.array[4]);
        a.ptr[5] = a.array[5] + b.array[5];
        a.ptr[6] = a.array[6] + (-b.array[6]);
        a.ptr[7] = a.array[7] + b.array[7];
        return a;
    }
}
unittest
{
    align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6];
    align(32) float[8] B = [ 9.0f, -7, 8, -0.5, 8, 7, 3, -1];
    __m256 R = _mm256_addsub_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
    float[8] correct = [ -10, -5, -11, 39999.5, -8, 10, 2, 5];
    assert(R.array == correct);
}

/// Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_and_pd (__m256d a, __m256d b) pure @trusted
{
    // Note: GCC avxintrin.h uses the builtins for AND NOTAND OR of _ps and _pd,
    //       but those do not seem needed at any optimization level.
    return cast(__m256d)(cast(__m256i)a & cast(__m256i)b);
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m256d A = _mm256_set_pd(a, b, a, b);
    __m256d B = _mm256_set_pd(b, a, b, a);
    long4 R = cast(long4)( _mm256_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
    assert(R.array[2] == correct);
    assert(R.array[3] == correct);
}

/// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_and_ps (__m256 a, __m256 b) pure @trusted
{
    return cast(__m256)(cast(__m256i)a & cast(__m256i)b);
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
    __m256 A = _mm256_set_ps(a, b, a, b, a, b, a, b);
    __m256 B = _mm256_set_ps(b, a, b, a, b, a, b, a);
    int8 R = cast(int8)( _mm256_and_ps(A, B) );
    foreach(i; 0..8)
        assert(R.array[i] == correct);
}

/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`
/// and then AND with `b`.
__m256d _mm256_andnot_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    __m256i notA = _mm256_not_si256(cast(__m256i)a);
    __m256i ib = cast(__m256i)b;
    __m256i ab = notA & ib;
    return cast(__m256d)ab;
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long notA = ~ ( *cast(long*)(&a) );
    long correct = notA & (*cast(long*)(&b));
    __m256d A = _mm256_set_pd(a, a, a, a);
    __m256d B = _mm256_set_pd(b, b, b, b);
    long4 R = cast(long4)( _mm256_andnot_pd(A, B) );
    foreach(i; 0..4)
        assert(R.array[i] == correct);
}
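// Bonus example (not an Intel intrinsic test, added for illustration):
// clearing the sign bit with _mm256_andnot_pd gives a branchless fabs()
// over 4 doubles at once.
unittest
{
    __m256d signbit = _mm256_set1_pd(-0.0); // only the sign bit set
    __m256d x = _mm256_setr_pd(-1.5, 2.0, -0.0, 8.25);
    __m256d absx = _mm256_andnot_pd(signbit, x); // computes ~signbit & x
    double[4] correct = [1.5, 2.0, 0.0, 8.25];
    assert(absx.array == correct);
}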
/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a`
/// and then AND with `b`.
__m256 _mm256_andnot_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    __m256i notA = _mm256_not_si256(cast(__m256i)a);
    __m256i ib = cast(__m256i)b;
    __m256i ab = notA & ib;
    return cast(__m256)ab;
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int notA = ~ ( *cast(int*)(&a) );
    int correct = notA & (*cast(int*)(&b));
    __m256 A = _mm256_set1_ps(a);
    __m256 B = _mm256_set1_ps(b);
    int8 R = cast(int8)( _mm256_andnot_ps(A, B) );
    foreach(i; 0..8)
        assert(R.array[i] == correct);
}

/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control
/// mask `imm8`: bit n of `imm8` selects element n from `b` if set, else from `a`.
__m256d _mm256_blend_pd(int imm8)(__m256d a, __m256d b)
{
    static assert(imm8 >= 0 && imm8 < 16);

    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_blendpd256 (a, b, imm8);
    }
    else
    {
        // Works great with LDC.
        double4 r;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(0, 1, 2, 3);
    __m256d B = _mm256_setr_pd(8, 9, 10, 11);
    double4 C = _mm256_blend_pd!0x06(A, B);
    double[4] correct = [0, 9, 10, 3];
    assert(C.array == correct);
}

/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control
/// mask `imm8`: bit n of `imm8` selects element n from `b` if set, else from `a`.
__m256 _mm256_blend_ps(int imm8)(__m256 a, __m256 b) pure @trusted
{
    static assert(imm8 >= 0 && imm8 < 256);
    // PERF DMD
    // PERF ARM64: not awesome with some constant values, up to 8/9 instructions
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_blendps256 (a, b, imm8);
    }
    else
    {
        // LDC x86: vblendps generated since LDC 1.27 -O1
        float8 r;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
    __m256 B = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15);
    float8 C = _mm256_blend_ps!0xe7(A, B);
    float[8] correct = [8, 9, 10, 3, 4, 13, 14, 15];
    assert(C.array == correct);
}
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`:
/// the sign bit of each mask element selects the corresponding element from `b`.
__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        // Amazingly enough, GCC/GDC generates the vblendvpd instruction
        // with -mavx2 but not -mavx. The reason is unclear; a replacement
        // sequence is generated instead.
        // PERF: sounds like a bug, similar to _mm_blendv_pd
        return __builtin_ia32_blendvpd256(a, b, mask);
    }
    else static if (LDC_with_AVX)
    {
        return __builtin_ia32_blendvpd256(a, b, mask);
    }
    else
    {
        // LDC x86: vblendvpd since LDC 1.27 -O2
        // arm64: only 4 instructions, since LDC 1.27 -O2
        __m256d r;
        long4 lmask = cast(long4)mask;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    __m256d B = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
    __m256d M = _mm256_setr_pd(-3.0, 2.0, 1.0, -4.0);
    __m256d R = _mm256_blendv_pd(A, B, M);
    double[4] correct1 = [5.0, 2.0, 3.0, 8.0];
    assert(R.array == correct1); // Note: probably the same NaN-mask oddity exists on arm64+linux as with _mm_blendv_pd
}

// TODO __m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask)

/// Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit)
/// floating-point elements) to all elements.
/// This effectively duplicates the 128-bit vector.
__m256d _mm256_broadcast_pd (const(__m128d)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastf128_pd256(cast(float4*)mem_addr);
    }
    else
    {
        const(double)* p = cast(const(double)*) mem_addr;
        __m256d r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[0];
        r.ptr[3] = p[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(3, -4);
    __m256d B = _mm256_broadcast_pd(&A);
    double[4] correct = [3, -4, 3, -4];
    assert(B.array == correct);
}

/// Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit)
/// floating-point elements) to all elements.
/// This effectively duplicates the 128-bit vector.
__m256 _mm256_broadcast_ps (const(__m128)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastf128_ps256(cast(float4*)mem_addr);
    }
    else
    {
        const(float)* p = cast(const(float)*)mem_addr;
        __m256 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        r.ptr[4] = p[0];
        r.ptr[5] = p[1];
        r.ptr[6] = p[2];
        r.ptr[7] = p[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, -4);
    __m256 B = _mm256_broadcast_ps(&A);
    float[8] correct = [1.0f, 2, 3, -4, 1, 2, 3, -4];
    assert(B.array == correct);
}

/// Broadcast a double-precision (64-bit) floating-point element from memory to all elements.
__m256d _mm256_broadcast_sd (const(double)* mem_addr) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastsd256(mem_addr);
    }
    else
    {
        double a = *mem_addr;
        __m256d r;
        r.ptr[0] = a;
        r.ptr[1] = a;
        r.ptr[2] = a;
        r.ptr[3] = a;
        return r;
    }
}
unittest
{
    double t = 7.5;
    __m256d A = _mm256_broadcast_sd(&t);
    double[4] correct = [7.5, 7.5, 7.5, 7.5];
    assert(A.array == correct);
}

/// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
__m128 _mm_broadcast_ss (const(float)* mem_addr) pure @trusted
{
    // PERF: DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastss(mem_addr);
    }
    else
    {
        float a = *mem_addr;
        __m128 r;
        r.ptr[0] = a;
        r.ptr[1] = a;
        r.ptr[2] = a;
        r.ptr[3] = a;
        return r;
    }
}
unittest
{
    float t = 7.5f;
    __m128 A = _mm_broadcast_ss(&t);
    float[4] correct = [7.5f, 7.5f, 7.5f, 7.5f];
    assert(A.array == correct);
}

/// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
__m256 _mm256_broadcast_ss (const(float)* mem_addr) pure @trusted
{
    // PERF: DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastss256 (mem_addr);
    }
    else
    {
        float a = *mem_addr;
        __m256 r = __m256(a);
        return r;
    }
}
unittest
{
    float t = 7.5f;
    __m256 A = _mm256_broadcast_ss(&t);
    float[8] correct = [7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f];
    assert(A.array == correct);
}

/// Cast vector of type `__m256d` to type `__m256`.
__m256 _mm256_castpd_ps (__m256d a) pure @safe
{
    return cast(__m256)a;
}

/// Cast vector of type `__m256d` to type `__m256i`.
__m256i _mm256_castpd_si256 (__m256d a) pure @safe
{
    return cast(__m256i)a;
}

/// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are undefined.
__m256d _mm256_castpd128_pd256 (__m128d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_pd256_pd(a);
    }
    else
    {
        __m256d r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, -6.125);
    __m256d B = _mm256_castpd128_pd256(A);
    assert(B.array[0] == 4.0);
    assert(B.array[1] == -6.125);
}

/// Cast vector of type `__m256d` to type `__m128d`; the upper 128 bits of `a` are lost.
__m128d _mm256_castpd256_pd128 (__m256d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_pd_pd256(a);
    }
    else
    {
        __m128d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_set_pd(1, 2, -6.25, 4.0);
    __m128d B = _mm256_castpd256_pd128(A);
    assert(B.array[0] == 4.0);
    assert(B.array[1] == -6.25);
}

/// Cast vector of type `__m256` to type `__m256d`.
__m256d _mm256_castps_pd (__m256 a) pure @safe
{
    return cast(__m256d)a;
}

/// Cast vector of type `__m256` to type `__m256i`.
__m256i _mm256_castps_si256 (__m256 a) pure @safe
{
    return cast(__m256i)a;
}
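// Illustrative added check: the casts above reinterpret bits and perform no
// conversion, so a round-trip preserves every element. _mm256_castsi256_pd is
// still TODO, so a raw cast is used for the return trip.
unittest
{
    __m256d A = _mm256_setr_pd(1.0, -2.5, 3.0, 4.0);
    __m256i I = _mm256_castpd_si256(A);
    __m256d B = cast(__m256d) I; // same bits back
    assert(B.array == A.array);
}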
/// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are undefined.
__m256 _mm256_castps128_ps256 (__m128 a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_ps256_ps(a);
    }
    else
    {
        __m256 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
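// Added check, mirroring the _mm256_castpd128_pd256 test above: only the low
// 128 bits of the result are defined, so only those are asserted.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, -4);
    __m256 B = _mm256_castps128_ps256(A);
    float[4] correct = [1.0f, 2, 3, -4];
    assert(B.array[0..4] == correct);
}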
// TODO __m128 _mm256_castps256_ps128 (__m256 a)
// TODO __m256i _mm256_castsi128_si256 (__m128i a)
// TODO __m256d _mm256_castsi256_pd (__m256i a)
// TODO __m256 _mm256_castsi256_ps (__m256i a)
// TODO __m128i _mm256_castsi256_si128 (__m256i a)

// TODO __m256d _mm256_ceil_pd (__m256d a)
// TODO __m256 _mm256_ceil_ps (__m256 a)
// TODO __m128d _mm_cmp_pd (__m128d a, __m128d b, const int imm8)
// TODO __m256d _mm256_cmp_pd (__m256d a, __m256d b, const int imm8)
// TODO __m128 _mm_cmp_ps (__m128 a, __m128 b, const int imm8)
// TODO __m256 _mm256_cmp_ps (__m256 a, __m256 b, const int imm8)
// TODO __m128d _mm_cmp_sd (__m128d a, __m128d b, const int imm8)
// TODO __m128 _mm_cmp_ss (__m128 a, __m128 b, const int imm8)
// TODO __m256d _mm256_cvtepi32_pd (__m128i a)
// TODO __m256 _mm256_cvtepi32_ps (__m256i a)
// TODO __m128i _mm256_cvtpd_epi32 (__m256d a)
// TODO __m128 _mm256_cvtpd_ps (__m256d a)
// TODO __m256i _mm256_cvtps_epi32 (__m256 a)
// TODO __m256d _mm256_cvtps_pd (__m128 a)
// TODO double _mm256_cvtsd_f64 (__m256d a)
// TODO int _mm256_cvtsi256_si32 (__m256i a)
// TODO float _mm256_cvtss_f32 (__m256 a)
// TODO __m128i _mm256_cvttpd_epi32 (__m256d a)
// TODO __m256i _mm256_cvttps_epi32 (__m256 a)
// TODO __m256d _mm256_div_pd (__m256d a, __m256d b)
// TODO __m256 _mm256_div_ps (__m256 a, __m256 b)
// TODO __m256 _mm256_dp_ps (__m256 a, __m256 b, const int imm8)

/// Extract a 32-bit integer from `a`, selected with `imm8`.
/// Note: only the low 3 bits of `imm8` are used, so the index wraps modulo 8.
int _mm256_extract_epi32 (__m256i a, const int imm8) pure @trusted
{
    return (cast(int8)a).array[imm8 & 7];
}
unittest
{
    align(16) int[8] data = [-1, 2, -3, 4, 9, -7, 8, -6];
    auto A = _mm256_loadu_si256(cast(__m256i*) data.ptr);
    assert(_mm256_extract_epi32(A, 0) == -1);
    assert(_mm256_extract_epi32(A, 1 + 8) == 2);
    assert(_mm256_extract_epi32(A, 3 + 16) == 4);
    assert(_mm256_extract_epi32(A, 7 + 32) == -6);
}

// TODO __int64 _mm256_extract_epi64 (__m256i a, const int index)
// TODO __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
// TODO __m128 _mm256_extractf128_ps (__m256 a, const int imm8)
// TODO __m128i _mm256_extractf128_si256 (__m256i a, const int imm8)
// TODO __m256d _mm256_floor_pd (__m256d a)
// TODO __m256 _mm256_floor_ps (__m256 a)
// TODO __m256d _mm256_hadd_pd (__m256d a, __m256d b)
// TODO __m256 _mm256_hadd_ps (__m256 a, __m256 b)
// TODO __m256d _mm256_hsub_pd (__m256d a, __m256d b)
// TODO __m256 _mm256_hsub_ps (__m256 a, __m256 b)
// TODO __m256i _mm256_insert_epi16 (__m256i a, __int16 i, const int index)
// TODO __m256i _mm256_insert_epi32 (__m256i a, __int32 i, const int index)
// TODO __m256i _mm256_insert_epi64 (__m256i a, __int64 i, const int index)
// TODO __m256i _mm256_insert_epi8 (__m256i a, __int8 i, const int index)
// TODO __m256d _mm256_insertf128_pd (__m256d a, __m128d b, int imm8)
// TODO __m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)
// TODO __m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8)
// TODO __m256i _mm256_lddqu_si256 (__m256i const * mem_addr)

/// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements)
/// from memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
/// exception may be generated.
__m256d _mm256_load_pd (const(double)* mem_addr) pure @trusted
{
    return *cast(__m256d*)mem_addr;
}
unittest
{
    static immutable align(32) double[4] correct = [1.0, 2.0, 3.5, -42.0];
    __m256d A = _mm256_load_pd(correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
__m256 _mm256_load_ps (const(float)* mem_addr) pure @trusted
{
    return *cast(__m256*)mem_addr;
}
unittest
{
    static immutable align(32) float[8] correct =
        [1.0, 2.0, 3.5, -42.0, 7.43f, 0.0f, 3, 2];
    __m256 A = _mm256_load_ps(correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits of integer data from memory. `mem_addr` does not need to be aligned on
/// any particular boundary.
// TODO: take void* as input
// TODO: make that @system
__m256i _mm256_loadu_si256 (const(__m256i)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m256i)(cast(long*)mem_addr);
    }
    else
    {
        const(long)* p = cast(const(long)*)mem_addr;
        long4 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        return r;
    }
}
unittest
{
    align(16) int[8] correct = [-1, 2, -3, 4, 9, -7, 8, -6];
    int8 A = cast(int8) _mm256_loadu_si256(cast(__m256i*) correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits of integer data from memory. `mem_addr` must be aligned on a
/// 32-byte boundary or a general-protection exception may be generated.
__m256i _mm256_load_si256 (const(void)* mem_addr) pure @trusted // TODO @system
{
    return *cast(__m256i*)mem_addr;
}
unittest
{
    static immutable align(64) long[4] correct = [1, -2, long.min, long.max];
    __m256i A = _mm256_load_si256(correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements)
/// from memory. `mem_addr` does not need to be aligned on any particular boundary.
__m256d _mm256_loadu_pd (const(void)* mem_addr) pure @trusted // TODO @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_loadupd256 ( cast(const(double)*) mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m256d)(cast(double*)mem_addr);
    }
    else
    {
        const(double)* p = cast(const(double)*)mem_addr;
        double4 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        return r;
    }
}
unittest
{
    double[4] correct = [1.0, -2.0, 0.0, 768.5];
    __m256d A = _mm256_loadu_pd(correct.ptr);
    assert(A.array == correct);
}


// TODO __m256 _mm256_loadu_ps (float const * mem_addr)
// TODO __m256 _mm256_loadu2_m128 (float const* hiaddr, float const* loaddr)
// TODO __m256d _mm256_loadu2_m128d (double const* hiaddr, double const* loaddr)
// TODO __m256i _mm256_loadu2_m128i (__m128i const* hiaddr, __m128i const* loaddr)
// TODO __m128d _mm_maskload_pd (double const * mem_addr, __m128i mask)
// TODO __m256d _mm256_maskload_pd (double const * mem_addr, __m256i mask)
// TODO __m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)
// TODO __m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)
// TODO void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a)
// TODO void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a)
// TODO void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)
// TODO void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)
// TODO __m256d _mm256_max_pd (__m256d a, __m256d b)
// TODO __m256 _mm256_max_ps (__m256 a, __m256 b)
// TODO __m256d _mm256_min_pd (__m256d a, __m256d b)
// TODO __m256 _mm256_min_ps (__m256 a, __m256 b)
// TODO __m256d _mm256_movedup_pd (__m256d a)
// TODO __m256 _mm256_movehdup_ps (__m256 a)
// TODO __m256 _mm256_moveldup_ps (__m256 a)
// TODO int _mm256_movemask_pd (__m256d a)
// TODO int _mm256_movemask_ps (__m256 a)

/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_mul_pd (__m256d a, __m256d b) pure @safe
{
    return a * b;
}
unittest
{
    __m256d a = [-2.0, 1.5, -2.0, 1.5];
    a = _mm256_mul_pd(a, a);
    assert(a.array == [4.0, 2.25, 4.0, 2.25]);
}

/// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_mul_ps (__m256 a, __m256 b) pure @safe
{
    return a * b;
}
unittest
{
    __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm256_mul_ps(a, a);
    float[8] correct = [2.25f, 4.0f, 9.0f, 1.0f, 2.25f, 4.0f, 9.0f, 1.0f];
    assert(a.array == correct);
}
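// Illustrative composition (not an Intel intrinsic): elementwise a*b + c over
// 4 doubles, built only from intrinsics defined in this module.
unittest
{
    __m256d a = _mm256_setr_pd(1, 2, 3, 4);
    __m256d b = _mm256_set1_pd(0.5);
    __m256d c = _mm256_setr_pd(10, 20, 30, 40);
    __m256d r = _mm256_add_pd(_mm256_mul_pd(a, b), c);
    double[4] correct = [10.5, 21.0, 31.5, 42.0];
    assert(r.array == correct);
}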
/// Compute the bitwise NOT of 256 bits in `a`. #BONUS
__m256i _mm256_not_si256 (__m256i a) pure @safe
{
    return ~a;
}
unittest
{
    __m256i A = _mm256_set1_epi64x(-748);
    long4 notA = cast(long4) _mm256_not_si256(A);
    long[4] correct = [747, 747, 747, 747];
    assert(notA.array == correct);
}


// TODO __m256d _mm256_or_pd (__m256d a, __m256d b)
// TODO __m256 _mm256_or_ps (__m256 a, __m256 b)
// TODO __m128d _mm_permute_pd (__m128d a, int imm8)
// TODO __m256d _mm256_permute_pd (__m256d a, int imm8)
// TODO __m128 _mm_permute_ps (__m128 a, int imm8)
// TODO __m256 _mm256_permute_ps (__m256 a, int imm8)
// TODO __m256d _mm256_permute2f128_pd (__m256d a, __m256d b, int imm8)
// TODO __m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
// TODO __m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)
// TODO __m128d _mm_permutevar_pd (__m128d a, __m128i b)
// TODO __m256d _mm256_permutevar_pd (__m256d a, __m256i b)
// TODO __m128 _mm_permutevar_ps (__m128 a, __m128i b)
// TODO __m256 _mm256_permutevar_ps (__m256 a, __m256i b)
// TODO __m256 _mm256_rcp_ps (__m256 a)
// TODO __m256d _mm256_round_pd (__m256d a, int rounding)
// TODO __m256 _mm256_round_ps (__m256 a, int rounding)
// TODO __m256 _mm256_rsqrt_ps (__m256 a)
// TODO __m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
// TODO __m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
// TODO __m256i _mm256_set_epi64x (__int64 e3, __int64 e2, __int64 e1, __int64 e0)
// TODO __m256i _mm256_set_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
// TODO __m256 _mm256_set_m128 (__m128 hi, __m128 lo)
// TODO __m256d _mm256_set_m128d (__m128d hi, __m128d lo)
// TODO __m256i _mm256_set_m128i (__m128i hi, __m128i lo)

/// Set packed double-precision (64-bit) floating-point elements with the supplied values.
__m256d _mm256_set_pd (double e3, double e2, double e1, double e0) pure @trusted
{
    // Note: with LDC, beats a load-unaligned thing.
    // PERF: see #102, use = void
    __m256d r;
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    return r;
}
unittest
{
    __m256d A = _mm256_set_pd(3, 2, 1, 546);
    double[4] correct = [546.0, 1.0, 2.0, 3.0];
    assert(A.array == correct);
}

/// Set packed single-precision (32-bit) floating-point elements with the supplied values.
__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted
{
    // PERF: see #102, use = void?
    __m256 r;
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    r.ptr[4] = e4;
    r.ptr[5] = e5;
    r.ptr[6] = e6;
    r.ptr[7] = e7;
    return r;
}
unittest
{
    __m256 A = _mm256_set_ps(3, 2, 1, 546.0f, -1.25f, -2, -3, 0);
    float[8] correct = [0, -3, -2, -1.25f, 546.0f, 1.0, 2.0, 3.0];
    assert(A.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi16 (short a) pure @trusted
{
    // Workaround for https://issues.dlang.org/show_bug.cgi?id=21469
    // It used to ICE, now the codegen is just wrong.
    // TODO: report this backend issue.
    version(DigitalMars)
    {
        short16 v = a;
        return cast(__m256i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(short16(a));
    }
}
unittest
{
    short16 a = cast(short16) _mm256_set1_epi16(31);
    for (int i = 0; i < 16; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 32-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi32 (int a) pure @trusted
{
    // Bad codegen else in DMD.
    // TODO: report this backend issue.
    version(DigitalMars)
    {
        int8 v = a;
        return cast(__m256i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(int8(a));
    }
}
unittest
{
    int8 a = cast(int8) _mm256_set1_epi32(31);
    for (int i = 0; i < 8; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 64-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi64x (long a) pure @trusted
{
    return cast(__m256i)(long4(a));
}
unittest
{
    long4 a = cast(long4) _mm256_set1_epi64x(-31);
    for (int i = 0; i < 4; ++i)
        assert(a.array[i] == -31);
}

/// Broadcast 8-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi8 (byte a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        byte32 v = a;
        return cast(__m256i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(byte32(a));
    }
}
unittest
{
    byte32 a = cast(byte32) _mm256_set1_epi8(31);
    for (int i = 0; i < 32; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast double-precision (64-bit) floating-point value `a` to all elements of the return value.
__m256d _mm256_set1_pd (double a) pure @trusted
{
    return __m256d(a);
}
unittest
{
    double a = 464.21;
    double[4] correct = [a, a, a, a];
    double4 A = cast(double4) _mm256_set1_pd(a);
    assert(A.array == correct);
}

/// Broadcast single-precision (32-bit) floating-point value `a` to all elements of the return value.
__m256 _mm256_set1_ps (float a) pure @trusted
{
    return __m256(a);
}
unittest
{
    float a = 464.21f;
    float[8] correct = [a, a, a, a, a, a, a, a];
    float8 A = cast(float8) _mm256_set1_ps(a);
    assert(A.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8,
                           short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[16] result = [ e15, e14, e13, e12, e11, e10, e9, e8,
                         e7, e6, e5, e4, e3, e2, e1, e0];
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) result.ptr);
    }
    else version(LDC)
    {
        return cast(__m256i)( loadUnaligned!(short16)(result.ptr) );
    }
    else
    {
        short16 r;
        for (int n = 0; n < 16; ++n)
            r.ptr[n] = result[n];
        return cast(__m256i)r;
    }
}
unittest
{
    short16 A = cast(short16) _mm256_setr_epi16(-1, 0, -21, 21, 42, 127, -42, -128,
                                                -1, 0, -21, 21, 42, 127, -42, -128);
    short[16] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128];
    assert(A.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
{
    int[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) result.ptr);
    }
    else version(LDC)
    {
        return cast(__m256i)( loadUnaligned!(int8)(result.ptr) );
    }
    else
    {
        int8 r;
        for (int n = 0; n < 8; ++n)
            r.ptr[n] = result[n];
        return cast(__m256i)r;
    }
}
unittest
{
    int8 A = cast(int8) _mm256_setr_epi32(-1, 0, -2147483648, 2147483647, 42, 666, -42, -666);
    int[8] correct = [-1, 0, -2147483648, 2147483647, 42, 666, -42, -666];
    assert(A.array == correct);
}

// TODO __m256i _mm256_setr_epi64x (__int64 e3, __int64 e2, __int64 e1, __int64 e0)

/// Set packed 8-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
                          byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
                          byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9, byte e8,
                          byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    // PERF GDC, not checked
    byte[32] result = [ e31, e30, e29, e28, e27, e26, e25, e24,
                        e23, e22, e21, e20, e19, e18, e17, e16,
                        e15, e14, e13, e12, e11, e10, e9, e8,
                        e7, e6, e5, e4, e3, e2, e1, e0];
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) result.ptr);
    }
    else version(LDC)
    {
        return cast(__m256i)( loadUnaligned!(byte32)(result.ptr) );
    }
    else
    {
        byte32 r;
        for (int n = 0; n < 32; ++n)
            r.ptr[n] = result[n];
        return cast(__m256i)r;
    }
}
unittest
{
    byte32 A = cast(byte32) _mm256_setr_epi8( -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128);
    byte[32] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128];
    assert(A.array == correct);
}

// TODO __m256 _mm256_setr_m128 (__m128 lo, __m128 hi)
// TODO __m256d _mm256_setr_m128d (__m128d lo, __m128d hi)
// TODO __m256i _mm256_setr_m128i (__m128i lo, __m128i hi)

/// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
__m256d _mm256_setr_pd (double e3, double e2, double e1, double e0) pure @trusted
{
    version(LDC)
    {
        // PERF, probably not the best
        double[4] result = [e3, e2, e1, e0];
        return loadUnaligned!(double4)(result.ptr);
    }
    else
    {
        __m256d r;
        r.ptr[0] = e3;
        r.ptr[1] = e2;
        r.ptr[2] = e1;
        r.ptr[3] = e0;
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(3, 2, 1, 546.125);
    double[4] correct = [3.0, 2.0, 1.0, 546.125];
    assert(A.array == correct);
}

/// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order.
__m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        align(32) float[8] r = [ e7, e6, e5, e4, e3, e2, e1, e0];
        return *cast(__m256*)r;
    }
    else version(LDC)
    {
        align(32) float[8] r = [ e7, e6, e5, e4, e3, e2, e1, e0];
        return *cast(__m256*)r;
    }
    else
    {
        __m256 r;
        r.ptr[0] = e7;
        r.ptr[1] = e6;
        r.ptr[2] = e5;
        r.ptr[3] = e4;
        r.ptr[4] = e3;
        r.ptr[5] = e2;
        r.ptr[6] = e1;
        r.ptr[7] = e0;
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps( 3, 2, 1, 546.125f, 4, 5, 6, 7);
    float[8] correct = [3.0f, 2, 1, 546.125f, 4, 5, 6, 7];
    assert(A.array == correct);
}

/// Return vector of type `__m256d` with all elements set to zero.
__m256d _mm256_setzero_pd () pure @safe
{
    return double4(0.0);
}
unittest
{
    __m256d A = _mm256_setzero_pd();
    double[4] correct = [0.0, 0.0, 0.0, 0.0];
    assert(A.array == correct);
}

/// Return vector of type `__m256` with all elements set to zero.
__m256 _mm256_setzero_ps () pure @safe
{
    return float8(0.0f);
}
unittest
{
    __m256 A = _mm256_setzero_ps();
    float[8] correct = [0.0f, 0, 0, 0, 0, 0, 0, 0];
    assert(A.array == correct);
}

/// Return vector of type `__m256i` with all elements set to zero.
__m256i _mm256_setzero_si256() pure @trusted
{
    return __m256i(0);
}
unittest
{
    __m256i A = _mm256_setzero_si256();
    long[4] correct = [0, 0, 0, 0];
    assert(A.array == correct);
}


// TODO __m256d _mm256_shuffle_pd (__m256d a, __m256d b, const int imm8)
// TODO __m256 _mm256_shuffle_ps (__m256 a, __m256 b, const int imm8)
// TODO __m256d _mm256_sqrt_pd (__m256d a)
// TODO __m256 _mm256_sqrt_ps (__m256 a)
// TODO void _mm256_store_pd (double * mem_addr, __m256d a)
// TODO void _mm256_store_ps (float * mem_addr, __m256 a)
// TODO void _mm256_store_si256 (__m256i * mem_addr, __m256i a)
// TODO void _mm256_storeu_pd (double * mem_addr, __m256d a)
// TODO void _mm256_storeu_ps (float * mem_addr, __m256 a)

/// Store 256-bits of integer data from `a` into memory. `mem_addr` does not need to be
/// aligned on any particular boundary.
void _mm256_storeu_si256 (__m256i* mem_addr, __m256i a) pure @trusted
{
    // PERF: DMD and GDC
    version(LDC)
    {
        storeUnaligned!__m256i(a, cast(long*)mem_addr);
    }
    else
    {
        long4 v = cast(long4)a;
        long* p = cast(long*)mem_addr;
        for (int n = 0; n < 4; ++n)
            p[n] = v.array[n];
    }
}
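// Added round-trip check: unaligned integer store followed by direct reads.
unittest
{
    long[4] arr;
    __m256i A = _mm256_set1_epi64x(42);
    _mm256_storeu_si256(cast(__m256i*) arr.ptr, A);
    long[4] correct = [42, 42, 42, 42];
    assert(arr == correct);
}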
// TODO void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a)
// TODO void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a)
// TODO void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a)
// TODO void _mm256_stream_pd (double * mem_addr, __m256d a)
// TODO void _mm256_stream_ps (float * mem_addr, __m256 a)
// TODO void _mm256_stream_si256 (__m256i * mem_addr, __m256i a)

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from
/// packed double-precision (64-bit) floating-point elements in `a`.
__m256d _mm256_sub_pd (__m256d a, __m256d b) pure @safe
{
    return a - b;
}
unittest
{
    __m256d a = [1.5, -2.0, 3.0, 200000.0];
    a = _mm256_sub_pd(a, a);
    double[4] correct = [0.0, 0, 0, 0];
    assert(a.array == correct);
}

/// Subtract packed single-precision (32-bit) floating-point elements in `b` from
/// packed single-precision (32-bit) floating-point elements in `a`.
__m256 _mm256_sub_ps (__m256 a, __m256 b) pure @safe
{
    return a - b;
}
unittest
{
    __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2000.0f, 3.0f, 1.0f];
    a = _mm256_sub_ps(a, a);
    float[8] correct = [0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == correct);
}


// TODO int _mm_testc_pd (__m128d a, __m128d b)
// TODO int _mm256_testc_pd (__m256d a, __m256d b)
// TODO int _mm_testc_ps (__m128 a, __m128 b)
// TODO int _mm256_testc_ps (__m256 a, __m256 b)
// TODO int _mm256_testc_si256 (__m256i a, __m256i b)
// TODO int _mm_testnzc_pd (__m128d a, __m128d b)
// TODO int _mm256_testnzc_pd (__m256d a, __m256d b)
// TODO int _mm_testnzc_ps (__m128 a, __m128 b)
// TODO int _mm256_testnzc_ps (__m256 a, __m256 b)
// TODO int _mm256_testnzc_si256 (__m256i a, __m256i b)
// TODO int _mm_testz_pd (__m128d a, __m128d b)
// TODO int _mm256_testz_pd (__m256d a, __m256d b)
// TODO int _mm_testz_ps (__m128 a, __m128 b)
// TODO int _mm256_testz_ps (__m256 a, __m256 b)
// TODO int _mm256_testz_si256 (__m256i a, __m256i b)

/// Return vector of type `__m256d` with undefined elements.
__m256d _mm256_undefined_pd () pure @safe
{
    __m256d r = void;
    return r;
}

/// Return vector of type `__m256` with undefined elements.
__m256 _mm256_undefined_ps () pure @safe
{
    __m256 r = void;
    return r;
}

/// Return vector of type `__m256i` with undefined elements.
__m256i _mm256_undefined_si256 () pure @safe
{
    __m256i r = void;
    return r;
}

// TODO __m256d _mm256_unpackhi_pd (__m256d a, __m256d b)
// TODO __m256 _mm256_unpackhi_ps (__m256 a, __m256 b)
// TODO __m256d _mm256_unpacklo_pd (__m256d a, __m256d b)
// TODO __m256 _mm256_unpacklo_ps (__m256 a, __m256 b)
// TODO __m256d _mm256_xor_pd (__m256d a, __m256d b)
// TODO __m256 _mm256_xor_ps (__m256 a, __m256 b)

void _mm256_zeroall () pure @safe
{
    // TODO: on GDC too?
    // Do nothing. The transition penalties are handled by the backend.
}

void _mm256_zeroupper () pure @safe
{
    // TODO: on GDC too?
    // Do nothing. The transition penalties are handled by the backend.
}
// TODO __m256d _mm256_zextpd128_pd256 (__m128d a)
// TODO __m256 _mm256_zextps128_ps256 (__m128 a)
// TODO __m256i _mm256_zextsi128_si256 (__m128i a)


/+

pragma(LDC_intrinsic, "llvm.x86.avx.blendv.pd.256")
double4 __builtin_ia32_blendvpd256(double4, double4, double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.blendv.ps.256")
float8 __builtin_ia32_blendvps256(float8, float8, float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.cvt.pd2.ps.256")
float4 __builtin_ia32_cvtpd2ps256(double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.cvt.pd2dq.256")
int4 __builtin_ia32_cvtpd2dq256(double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.cvt.ps2dq.256")
int8 __builtin_ia32_cvtps2dq256(float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.cvtt.pd2dq.256")
int4 __builtin_ia32_cvttpd2dq256(double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.cvtt.ps2dq.256")
int8 __builtin_ia32_cvttps2dq256(float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.dp.ps.256")
float8 __builtin_ia32_dpps256(float8, float8, byte) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.hadd.pd.256")
double4 __builtin_ia32_haddpd256(double4, double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.hadd.ps.256")
float8 __builtin_ia32_haddps256(float8, float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.hsub.pd.256")
double4 __builtin_ia32_hsubpd256(double4, double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.hsub.ps.256")
float8 __builtin_ia32_hsubps256(float8, float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.ldu.dq.256")
byte32 __builtin_ia32_lddqu256(const void*);

pragma(LDC_intrinsic, "llvm.x86.avx.maskload.pd")
double2 __builtin_ia32_maskloadpd(const void*, long2);

pragma(LDC_intrinsic, "llvm.x86.avx.maskload.pd.256")
double4 __builtin_ia32_maskloadpd256(const void*, long4);

pragma(LDC_intrinsic, "llvm.x86.avx.maskload.ps")
float4 __builtin_ia32_maskloadps(const void*, int4);

pragma(LDC_intrinsic, "llvm.x86.avx.maskload.ps.256")
float8 __builtin_ia32_maskloadps256(const void*, int8);

pragma(LDC_intrinsic, "llvm.x86.avx.maskstore.pd")
void __builtin_ia32_maskstorepd(void*, long2, double2);

pragma(LDC_intrinsic, "llvm.x86.avx.maskstore.pd.256")
void __builtin_ia32_maskstorepd256(void*, long4, double4);

pragma(LDC_intrinsic, "llvm.x86.avx.maskstore.ps")
void __builtin_ia32_maskstoreps(void*, int4, float4);

pragma(LDC_intrinsic, "llvm.x86.avx.maskstore.ps.256")
void __builtin_ia32_maskstoreps256(void*, int8, float8);

pragma(LDC_intrinsic, "llvm.x86.avx.max.pd.256")
double4 __builtin_ia32_maxpd256(double4, double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.max.ps.256")
float8 __builtin_ia32_maxps256(float8, float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.min.pd.256")
double4 __builtin_ia32_minpd256(double4, double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.min.ps.256")
float8 __builtin_ia32_minps256(float8, float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.movmsk.pd.256")
int __builtin_ia32_movmskpd256(double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.movmsk.ps.256")
int __builtin_ia32_movmskps256(float8) pure @safe;
pragma(LDC_intrinsic, "llvm.x86.avx.ptestc.256")
int __builtin_ia32_ptestc256(long4, long4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.ptestnzc.256")
int __builtin_ia32_ptestnzc256(long4, long4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.ptestz.256")
int __builtin_ia32_ptestz256(long4, long4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.rcp.ps.256")
float8 __builtin_ia32_rcpps256(float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.round.pd.256")
double4 __builtin_ia32_roundpd256(double4, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.round.ps.256")
float8 __builtin_ia32_roundps256(float8, int) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.rsqrt.ps.256")
float8 __builtin_ia32_rsqrtps256(float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vpermilvar.pd")
double2 __builtin_ia32_vpermilvarpd(double2, long2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vpermilvar.pd.256")
double4 __builtin_ia32_vpermilvarpd256(double4, long4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vpermilvar.ps")
float4 __builtin_ia32_vpermilvarps(float4, int4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vpermilvar.ps.256")
float8 __builtin_ia32_vpermilvarps256(float8, int8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestc.pd")
int __builtin_ia32_vtestcpd(double2, double2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestc.pd.256")
int __builtin_ia32_vtestcpd256(double4, double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestc.ps")
int __builtin_ia32_vtestcps(float4, float4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestc.ps.256")
int __builtin_ia32_vtestcps256(float8, float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestnzc.pd")
int __builtin_ia32_vtestnzcpd(double2, double2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestnzc.pd.256")
int __builtin_ia32_vtestnzcpd256(double4, double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestnzc.ps")
int __builtin_ia32_vtestnzcps(float4, float4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestnzc.ps.256")
int __builtin_ia32_vtestnzcps256(float8, float8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestz.pd")
int __builtin_ia32_vtestzpd(double2, double2) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestz.pd.256")
int __builtin_ia32_vtestzpd256(double4, double4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestz.ps")
int __builtin_ia32_vtestzps(float4, float4) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.avx.vtestz.ps.256")
int __builtin_ia32_vtestzps256(float8, float8) pure @safe;

+/