/**
* AVX intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
*
* Copyright: Guillaume Piolat 2022.
*            Johan Engelen 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.avxintrin;

// AVX instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX
// Note: this header will work whether you have AVX enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
// generate AVX instructions.
// With GDC, use "dflags-gdc": ["-mavx"] or equivalent to actively
// generate AVX instructions.


/// IMPORTANT NOTE ABOUT MASK LOAD/STORE:
///
/// In theory, masked load/store can address unaddressable memory provided the mask is zero.
/// In practice, that is not the case for the following reasons:
///
/// - AMD manual says:
///   "Exception and trap behavior for elements not selected for loading or storing from/to memory
///   is implementation dependent. For instance, a given implementation may signal a data
///   breakpoint or a page fault for doublewords that are zero-masked and not actually written."
///
/// - Intel fetches the whole cacheline anyway:
///   https://erik.science/2019/06/21/AVX-fun.html
///   "Even if the mask is stored in the special mask registers, it will still first fetch the data
///   before checking the mask."
///
/// So intel-intrinsics adopted the tightened semantics of only addressing fully addressable memory
/// with masked loads and stores.


/// Some AVX intrinsics take a float comparison constant.
/// When labelled "ordered" it means "AND ordered"
/// When labelled "unordered" it means "OR unordered"
alias _CMP_EQ = int;
///ditto
enum : _CMP_EQ
{
    // Each constant is the immediate passed as `imm8` to the `_mm*_cmp_*` templates.
    // Suffix letters: O = ordered, U = unordered, Q = non-signaling (quiet), S = signaling.
    _CMP_EQ_OQ    = 0x00, // Equal (ordered, non-signaling)
    _CMP_LT_OS    = 0x01, // Less-than (ordered, signaling)
    _CMP_LE_OS    = 0x02, // Less-than-or-equal (ordered, signaling)
    _CMP_UNORD_Q  = 0x03, // Unordered (non-signaling)
    _CMP_NEQ_UQ   = 0x04, // Not-equal (unordered, non-signaling)
    _CMP_NLT_US   = 0x05, // Not-less-than (unordered, signaling)
    _CMP_NLE_US   = 0x06, // Not-less-than-or-equal (unordered, signaling)
    _CMP_ORD_Q    = 0x07, // Ordered (non-signaling)
    _CMP_EQ_UQ    = 0x08, // Equal (unordered, non-signaling)
    _CMP_NGE_US   = 0x09, // Not-greater-than-or-equal (unordered, signaling)
    _CMP_NGT_US   = 0x0a, // Not-greater-than (unordered, signaling)
    _CMP_FALSE_OQ = 0x0b, // False (ordered, non-signaling)
    _CMP_NEQ_OQ   = 0x0c, // Not-equal (ordered, non-signaling)
    _CMP_GE_OS    = 0x0d, // Greater-than-or-equal (ordered, signaling)
    _CMP_GT_OS    = 0x0e, // Greater-than (ordered, signaling)
    _CMP_TRUE_UQ  = 0x0f, // True (unordered, non-signaling)
    _CMP_EQ_OS    = 0x10, // Equal (ordered, signaling)
    _CMP_LT_OQ    = 0x11, // Less-than (ordered, non-signaling)
    _CMP_LE_OQ    = 0x12, // Less-than-or-equal (ordered, non-signaling)
    _CMP_UNORD_S  = 0x13, // Unordered (signaling)
    _CMP_NEQ_US   = 0x14, // Not-equal (unordered, signaling)
    _CMP_NLT_UQ   = 0x15, // Not-less-than (unordered, non-signaling)
    _CMP_NLE_UQ   = 0x16, // Not-less-than-or-equal (unordered, non-signaling)
    _CMP_ORD_S    = 0x17, // Ordered (signaling)
    _CMP_EQ_US    = 0x18, // Equal (unordered, signaling)
    _CMP_NGE_UQ   = 0x19, // Not-greater-than-or-equal (unordered, non-signaling)
    _CMP_NGT_UQ   = 0x1a, // Not-greater-than (unordered, non-signaling)
    _CMP_FALSE_OS = 0x1b, // False (ordered, signaling)
    _CMP_NEQ_OS   = 0x1c, // Not-equal (ordered, signaling)
    _CMP_GE_OQ    = 0x1d, // Greater-than-or-equal (ordered, non-signaling)
    _CMP_GT_OQ    = 0x1e, // Greater-than (ordered, non-signaling)
    _CMP_TRUE_US  = 0x1f  // True (unordered, signaling)
}

public import inteli.types;
import inteli.internals;

// Pull in all previous instruction set intrinsics.
public import inteli.smmintrin;
public import inteli.tmmintrin;



// In x86, LDC earlier version may have trouble preserving the stack pointer when an unsupported
// 256-bit vector type is passed, and AVX is disabled.
// This leads to disabling some intrinsics in this particular situation, since they are not safe for
// the caller.
// The flag below is only ever true for LDC targeting 32-bit x86 with a front-end version
// below 2.099 (`__VERSION__ < 2099`); every other compiler/target combination gets `false`.
version(LDC)
{
    version(X86)
    {
        enum llvm256BitStackWorkaroundIn32BitX86 = __VERSION__ < 2099;
    }
    else
        enum llvm256BitStackWorkaroundIn32BitX86 = false;
}
else
    enum llvm256BitStackWorkaroundIn32BitX86 = false;




// Attribute label: applies `nothrow @nogc` to every declaration that follows in this module.
nothrow @nogc:

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
/// Returns: the lane-wise sum `a + b` (4 lanes).
__m256d _mm256_add_pd (__m256d a, __m256d b) pure @trusted
{
    // Plain vector addition operator on the 256-bit vector type.
    return a + b;
}
unittest
{
    align(32) double[4] A = [-1, 2, -3, 40000];
    align(32) double[4] B = [ 9, -7, 8, -0.5];
    __m256d R = _mm256_add_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
    double[4] correct = [8, -5, 5, 39999.5];
    assert(R.array == correct);
}

/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
/// Returns: the lane-wise sum `a + b` (8 lanes).
__m256 _mm256_add_ps (__m256 a, __m256 b) pure @trusted
{
    // Plain vector addition operator on the 256-bit vector type.
    return a + b;
}
unittest
{
    align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6];
    align(32) float[8] B = [ 9.0f, -7, 8, -0.5, 8, 7, 3, -1];
    __m256 R = _mm256_add_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
    float[8] correct = [8, -5, 5, 39999.5, 8, 10, 8, 5];
    assert(R.array == correct);
}

/// Alternatively add and subtract packed double-precision (64-bit) floating-point
/// elements in `a` to/from packed elements in `b`.
__m256d _mm256_addsub_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (!GDC_or_LDC_with_AVX)
    {
        // Portable path: even lanes compute a - b (written as a + (-b)), odd lanes a + b.
        // Note: GDC x86 generates addsubpd since GDC 11.1 with -O3
        //       LDC x86 generates addsubpd since LDC 1.18 with -O2
        //       LDC ARM: not fantastic, ok since LDC 1.18 -O2
        __m256d result = a;
        result.ptr[0] = a.array[0] + (-b.array[0]);
        result.ptr[1] = a.array[1] + b.array[1];
        result.ptr[2] = a.array[2] + (-b.array[2]);
        result.ptr[3] = a.array[3] + b.array[3];
        return result;
    }
    else
    {
        // Compiler builtin available: use it directly.
        return __builtin_ia32_addsubpd256(a, b);
    }
}
unittest
{
    align(32) double[4] A = [-1, 2, -3, 40000];
    align(32) double[4] B = [ 9, -7, 8, -0.5];
    __m256d R = _mm256_addsub_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
    double[4] correct = [-10, -5, -11, 39999.5];
    assert(R.array == correct);
}

/// Alternatively add and subtract packed single-precision (32-bit) floating-point elements
/// in `a` to/from packed elements in `b`.
/// Even lanes hold `a - b`, odd lanes hold `a + b`.
__m256 _mm256_addsub_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (!GDC_or_LDC_with_AVX)
    {
        // Note: GDC x86 generates addsubps since GDC 11 -O3
        //               and in absence of AVX, a pair of SSE3 addsubps since GDC 12 -O2
        //       LDC x86 generates addsubps since LDC 1.18 -O2
        //               and in absence of AVX, a pair of SSE3 addsubps since LDC 1.1 -O1
        //       LDC ARM: neat output since LDC 1.21 -O2
        __m256 result = a;
        result.ptr[0] = a.array[0] + (-b.array[0]);
        result.ptr[1] = a.array[1] + b.array[1];
        result.ptr[2] = a.array[2] + (-b.array[2]);
        result.ptr[3] = a.array[3] + b.array[3];
        result.ptr[4] = a.array[4] + (-b.array[4]);
        result.ptr[5] = a.array[5] + b.array[5];
        result.ptr[6] = a.array[6] + (-b.array[6]);
        result.ptr[7] = a.array[7] + b.array[7];
        return result;
    }
    else
    {
        // Compiler builtin available: use it directly.
        return __builtin_ia32_addsubps256(a, b);
    }
}
unittest
{
    align(32) float[8] A = [-1.0f,  2,  -3, 40000,    0, 3,  5,  6];
    align(32) float[8] B = [ 9.0f, -7,   8,  -0.5,    8, 7,  3, -1];
    __m256 R = _mm256_addsub_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
    float[8] correct = [  -10, -5, -11, 39999.5, -8, 10, 2, 5];
    assert(R.array == correct);
}

/// Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in `a` and `b`.
/// The operands are reinterpreted as integers, ANDed, and the bits reinterpreted back.
__m256d _mm256_and_pd (__m256d a, __m256d b) pure @trusted
{
    // Note: GCC avxintrin.h uses the builtins for AND NOTAND OR of _ps and _pd,
    //       but those do not seem needed at any optimization level.
    __m256i bitsA = cast(__m256i)a;
    __m256i bitsB = cast(__m256i)b;
    return cast(__m256d)(bitsA & bitsB);
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m256d A = _mm256_set_pd(a, b, a, b);
    __m256d B = _mm256_set_pd(b, a, b, a);
    long4 R = cast(long4)( _mm256_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
    assert(R.array[2] == correct);
    assert(R.array[3] == correct);
}

/// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
/// The operands are reinterpreted as integers, ANDed, and the bits reinterpreted back.
__m256 _mm256_and_ps (__m256 a, __m256 b) pure @trusted
{
    __m256i bitsA = cast(__m256i)a;
    __m256i bitsB = cast(__m256i)b;
    return cast(__m256)(bitsA & bitsB);
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
    __m256 A = _mm256_set_ps(a, b, a, b, a, b, a, b);
    __m256 B = _mm256_set_ps(b, a, b, a, b, a, b, a);
    int8 R = cast(int8)( _mm256_and_ps(A, B) );
    foreach(i; 0..8)
        assert(R.array[i] == correct);
}

/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`
/// and then AND with b.
__m256d _mm256_andnot_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    // Work on the integer reinterpretation: (~a) & b, then reinterpret back.
    __m256i notA = _mm256_not_si256(cast(__m256i)a);
    __m256i ib = cast(__m256i)b;
    __m256i ab = notA & ib;
    return cast(__m256d)ab;
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long notA = ~ ( *cast(long*)(&a) );
    long correct = notA & (*cast(long*)(&b));
    __m256d A = _mm256_set_pd(a, a, a, a);
    __m256d B = _mm256_set_pd(b, b, b, b);
    long4 R = cast(long4)( _mm256_andnot_pd(A, B) );
    foreach(i; 0..4)
        assert(R.array[i] == correct);
}

/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a`
/// and then AND with b.
__m256 _mm256_andnot_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    // Work on the integer reinterpretation: (~a) & b, then reinterpret back.
    __m256i notA = _mm256_not_si256(cast(__m256i)a);
    __m256i ib = cast(__m256i)b;
    __m256i ab = notA & ib;
    return cast(__m256)ab;
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int notA = ~ ( *cast(int*)(&a) );
    int correct = notA & (*cast(int*)(&b));
    __m256 A = _mm256_set1_ps(a);
    __m256 B = _mm256_set1_ps(b);
    int8 R = cast(int8)( _mm256_andnot_ps(A, B) );
    foreach(i; 0..8)
        assert(R.array[i] == correct);
}

/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control
/// mask `imm8`.
/// Bit `n` of `imm8` selects lane `n` from `b` when set, and from `a` when clear.
/// `imm8` must be in the range 0 .. 15.
__m256d _mm256_blend_pd(int imm8)(__m256d a, __m256d b) pure @trusted
{
    static assert(imm8 >= 0 && imm8 < 16);

    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_blendpd256 (a, b, imm8);
    }
    else
    {
        // Works great with LDC.
        double4 r;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(0, 1, 2, 3);
    __m256d B = _mm256_setr_pd(8, 9, 10, 11);
    double4 C = _mm256_blend_pd!0x06(A, B);
    double[4] correct =    [0, 9, 10, 3];
    assert(C.array == correct);
}

/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control
/// mask `imm8`.
/// Bit `n` of `imm8` selects lane `n` from `b` when set, and from `a` when clear.
/// `imm8` must be in the range 0 .. 255.
__m256 _mm256_blend_ps(int imm8)(__m256 a, __m256 b) pure @trusted
{
    static assert(imm8 >= 0 && imm8 < 256);
    // PERF DMD
    // PERF ARM64: not awesome with some constant values, up to 8/9 instructions
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_blendps256 (a, b, imm8);
    }
    else
    {
        // LDC x86: vblendps generated since LDC 1.27 -O1
        float8 r;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0, 1,  2,  3,  4,  5,  6,  7);
    __m256 B = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15);
    float8 C = _mm256_blend_ps!0xe7(A, B);
    float[8] correct =       [8, 9, 10,  3,  4, 13, 14, 15];
    assert(C.array == correct);
}

/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using mask.
/// A lane is taken from `b` when the sign bit of the corresponding `mask` lane is set,
/// and from `a` otherwise.
__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        // Amazingly enough, GCC/GDC generates the vblendvpd instruction
        // with -mavx2 but not -mavx.
        // Not sure what is the reason, and there is a replacement sequence.
        // PERF: Sounds like a bug, similar to _mm_blendv_pd
        //       or maybe the instruction is unsafe?
        return __builtin_ia32_blendvpd256(a, b, mask);
    }
    else
    {
        // LDC x86: vblendvpd since LDC 1.27 -O2
        //   arm64: only 4 instructions, since LDC 1.27 -O2
        __m256d r;
        long4 lmask = cast(long4)mask;
        for (int n = 0; n < 4; ++n)
        {
            // A negative integer reinterpretation means the sign bit is set.
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    __m256d B = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
    __m256d M = _mm256_setr_pd(-3.0, 2.0, 1.0, -4.0);
    __m256d R = _mm256_blendv_pd(A, B, M);
    double[4] correct1 = [5.0, 2.0, 3.0, 8.0];
    assert(R.array == correct1);
}

/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b`
/// using `mask`.
/// A lane is taken from `b` when the sign bit of the corresponding `mask` lane is set,
/// and from `a` otherwise.
__m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask) @trusted
{
    // PERF DMD
    // PERF LDC/GDC without AVX could use two intrinsics for each part
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_blendvps256(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        // Arithmetic shift by 31 smears the sign bit over the whole lane, giving an
        // all-ones/all-zeroes lane mask, then select with the xor/and/xor idiom.
        int8 shift;
        shift = 31;
        int8 lmask = cast(int8)mask >> shift;
        int8 ia = cast(int8)a;
        int8 ib = cast(int8)b;
        return cast(__m256)(ia ^ ((ia ^ ib) & lmask));
    }
    else
    {
        __m256 r = void; // every lane is written by the loop below
        int8 lmask = cast(int8)mask;
        for (int n = 0; n < 8; ++n)
        {
            // A negative integer reinterpretation means the sign bit is set.
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
    __m256 B = _mm256_setr_ps(5.0f, 6.0f, 7.0f, 8.0f, 5.0f, 6.0f, 7.0f, 8.0f);
    __m256 M = _mm256_setr_ps(-3.0f, 2.0f, 1.0f, -4.0f, -3.0f, 2.0f, 1.0f, -4.0f);
    __m256 R = _mm256_blendv_ps(A, B, M);
    float[8] correct1 = [5.0f, 2.0f, 3.0f, 8.0f, 5.0f, 2.0f, 3.0f, 8.0f];
    assert(R.array == correct1);
}

/// Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit)
/// floating-point elements) to all elements.
/// This effectively duplicates the 128-bit vector.
__m256d _mm256_broadcast_pd (const(__m128d)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastf128_pd256(cast(float4*)mem_addr);
    }
    else
    {
        // Read the two source doubles and write them to both 128-bit halves.
        const(double)* p = cast(const(double)*) mem_addr;
        __m256d r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[0];
        r.ptr[3] = p[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(3, -4);
    __m256d B = _mm256_broadcast_pd(&A);
    double[4] correct = [3, -4, 3, -4];
    assert(B.array == correct);
}

/// Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit)
/// floating-point elements) to all elements.
/// This effectively duplicates the 128-bit vector.
__m256 _mm256_broadcast_ps (const(__m128)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (!GDC_with_AVX)
    {
        // Copy the four source floats into both the lower and the upper half.
        const(float)* src = cast(const(float)*)mem_addr;
        __m256 result;
        foreach (lane; 0 .. 4)
        {
            result.ptr[lane]     = src[lane];
            result.ptr[lane + 4] = src[lane];
        }
        return result;
    }
    else
    {
        return __builtin_ia32_vbroadcastf128_ps256(cast(float4*)mem_addr);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, -4);
    __m256 B = _mm256_broadcast_ps(&A);
    float[8] correct = [1.0f, 2, 3, -4, 1, 2, 3, -4];
    assert(B.array == correct);
}

/// Broadcast a double-precision (64-bit) floating-point element from memory to all elements.
__m256d _mm256_broadcast_sd (const(double)* mem_addr) pure @trusted
{
    static if (!GDC_with_AVX)
    {
        // Load the scalar once, then replicate it into every lane.
        const double value = *mem_addr;
        __m256d result;
        foreach (lane; 0 .. 4)
            result.ptr[lane] = value;
        return result;
    }
    else
    {
        return __builtin_ia32_vbroadcastsd256(mem_addr);
    }
}
unittest
{
    double t = 7.5f;
    __m256d A = _mm256_broadcast_sd(&t);
    double[4] correct = [7.5, 7.5, 7.5, 7.5];
    assert(A.array == correct);
}

/// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
__m128 _mm_broadcast_ss (const(float)* mem_addr) pure @trusted
{
    // PERF: DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastss(mem_addr);
    }
    else
    {
        // Load the scalar once, then replicate it into every lane.
        float a = *mem_addr;
        __m128 r;
        r.ptr[0] = a;
        r.ptr[1] = a;
        r.ptr[2] = a;
        r.ptr[3] = a;
        return r;
    }
}
unittest
{
    float t = 7.5f;
    __m128 A = _mm_broadcast_ss(&t);
    float[4] correct = [7.5f, 7.5f, 7.5f, 7.5f];
    assert(A.array == correct);
}

/// Broadcast a single-precision (32-bit) floating-point element from memory to all
/// 8 elements of the 256-bit result.
__m256 _mm256_broadcast_ss (const(float)* mem_addr) pure @trusted
{
    // PERF: DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastss256 (mem_addr);
    }
    else
    {
        // Constructing the vector from a scalar splats it into every lane.
        float a = *mem_addr;
        __m256 r = __m256(a);
        return r;
    }
}
unittest
{
    float t = 7.5f;
    __m256 A = _mm256_broadcast_ss(&t);
    float[8] correct = [7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f];
    assert(A.array == correct);
}

/// Cast vector of type `__m256d` to type `__m256`.
/// This is a bit reinterpretation, no value conversion takes place.
__m256 _mm256_castpd_ps (__m256d a) pure @safe
{
    return cast(__m256)a;
}

/// Cast vector of type `__m256d` to type `__m256i`.
/// This is a bit reinterpretation, no value conversion takes place.
__m256i _mm256_castpd_si256 (__m256d a) pure @safe
{
    return cast(__m256i)a;
}

/// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are undefined.
__m256d _mm256_castpd128_pd256 (__m128d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_pd256_pd(a);
    }
    else
    {
        // Upper lanes are intentionally left uninitialized (`= void`).
        __m256d r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, -6.125);
    __m256d B = _mm256_castpd128_pd256(A);
    assert(B.array[0] == 4.0);
    assert(B.array[1] == -6.125);
}

/// Cast vector of type `__m256d` to type `__m128d`; the upper 128 bits of `a` are lost.
__m128d _mm256_castpd256_pd128 (__m256d a) pure @trusted
{
    static if (!GDC_with_AVX)
    {
        // Keep only the two low lanes.
        __m128d lowHalf;
        foreach (lane; 0 .. 2)
            lowHalf.ptr[lane] = a.array[lane];
        return lowHalf;
    }
    else
    {
        return __builtin_ia32_pd_pd256(a);
    }
}
unittest
{
    __m256d A = _mm256_set_pd(1, 2, -6.25, 4.0);
    __m128d B = _mm256_castpd256_pd128(A);
    assert(B.array[0] == 4.0);
    assert(B.array[1] == -6.25);
}

/// Reinterpret the bits of a `__m256` vector as a `__m256d` vector.
__m256d _mm256_castps_pd (__m256 a) pure @safe
{
    __m256d reinterpreted = cast(__m256d)a;
    return reinterpreted;
}

/// Reinterpret the bits of a `__m256` vector as a `__m256i` vector.
__m256i _mm256_castps_si256 (__m256 a) pure @safe
{
    __m256i reinterpreted = cast(__m256i)a;
    return reinterpreted;
}

/// Widen a `__m128` vector to `__m256`; the upper 128 bits of the result are undefined.
__m256 _mm256_castps128_ps256 (__m128 a) pure @trusted
{
    static if (!GDC_with_AVX)
    {
        // Only the four low lanes are written; the rest stays uninitialized.
        __m256 widened = void;
        foreach (lane; 0 .. 4)
            widened.ptr[lane] = a.array[lane];
        return widened;
    }
    else
    {
        return __builtin_ia32_ps256_ps(a);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
    __m256 B = _mm256_castps128_ps256(A);
    float[4] correct = [1.0f, 2, 3, 4];
    assert(B.array[0..4] == correct);
}

/// Narrow a `__m256` vector to `__m128`. The upper 128 bits of `a` are lost.
__m128 _mm256_castps256_ps128 (__m256 a) pure @trusted
{
    // Read the first 128 bits of the by-value parameter through a narrower pointer.
    const(__m128)* lowHalf = cast(const(__m128)*)(&a);
    return *lowHalf;
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    __m128 B = _mm256_castps256_ps128(A);
    float[4] correct = [1.0f, 2, 3, 4];
    assert(B.array == correct);
}

/// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are undefined.
__m256i _mm256_castsi128_si256 (__m128i a) pure @trusted
{
    // View the input as two 64-bit lanes and copy them into the low half.
    // The two upper lanes stay uninitialized on purpose.
    long2 source = cast(long2)a;
    long4 widened = void;
    foreach (lane; 0 .. 2)
        widened.ptr[lane] = source.array[lane];
    return widened;
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 42);
    __m256i B = _mm256_castsi128_si256(A);
    long[2] correct = [-1, 42];
    assert(B.array[0..2] == correct);
}

/// Reinterpret the bits of a `__m256i` vector as a `__m256d` vector.
__m256d _mm256_castsi256_pd (__m256i a) pure @safe
{
    __m256d reinterpreted = cast(__m256d)a;
    return reinterpreted;
}

/// Reinterpret the bits of a `__m256i` vector as a `__m256` vector.
__m256 _mm256_castsi256_ps (__m256i a) pure @safe
{
    __m256 reinterpreted = cast(__m256)a;
    return reinterpreted;
}

/// Narrow a `__m256i` vector to `__m128i`. The upper 128 bits of `a` are lost.
__m128i _mm256_castsi256_si128 (__m256i a) pure @trusted
{
    // Both lanes are written below, so no default initialization is needed.
    long2 lowHalf = void;
    foreach (lane; 0 .. 2)
        lowHalf.ptr[lane] = a.array[lane];
    return cast(__m128i)lowHalf;
}
unittest
{
    long4 A;
    A.ptr[0] = -1;
    A.ptr[1] = 42;
    long2 B = cast(long2)(_mm256_castsi256_si128(A));
    long[2] correct = [-1, 42];
    assert(B.array[0..2] == correct);
}

/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer
/// value, and store the results as packed double-precision floating-point elements.
__m256d _mm256_ceil_pd (__m256d a) @safe
{
    static if (!LDC_with_ARM64)
    {
        // Delegate to the generic rounding intrinsic with immediate 2.
        return _mm256_round_pd!2(a);
    }
    else
    {
        // Split into 128-bit halves, round each with `_mm_ceil_pd`, then reassemble.
        __m128d lowerHalf = _mm256_extractf128_pd!0(a);
        __m128d upperHalf = _mm256_extractf128_pd!1(a);
        __m128d lowerCeil = _mm_ceil_pd(lowerHalf);
        __m128d upperCeil = _mm_ceil_pd(upperHalf);
        return _mm256_set_m128d(upperCeil, lowerCeil);
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.3f, -2.12f, 53.6f, -2.7f);
    A = _mm256_ceil_pd(A);
    double[4] correct = [2.0, -2.0, 54.0, -2.0];
    assert(A.array == correct);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer
/// value, and store the results as packed single-precision floating-point elements.
__m256 _mm256_ceil_ps (__m256 a) @safe
{
    static if (!LDC_with_ARM64)
    {
        // Delegate to the generic rounding intrinsic with immediate 2.
        return _mm256_round_ps!2(a);
    }
    else
    {
        // Split into 128-bit halves, round each with `_mm_ceil_ps`, then reassemble.
        __m128 lowerHalf = _mm256_extractf128_ps!0(a);
        __m128 upperHalf = _mm256_extractf128_ps!1(a);
        __m128 lowerCeil = _mm_ceil_ps(lowerHalf);
        __m128 upperCeil = _mm_ceil_ps(upperHalf);
        return _mm256_set_m128(upperCeil, lowerCeil);
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f);
    __m256 C = _mm256_ceil_ps(A);
    float[8] correct       = [2.0f, -2.0f,  54.0f, -2.0f, -1,    3,     -53,    3];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b` based on the
/// comparison operand specified by `imm8`.
__m128d _mm_cmp_pd(int imm8)(__m128d a, __m128d b) pure @safe
{
    // Translate the AVX predicate immediate at compile time, then dispatch to the helper.
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128d) cmppd!comparison(a, b);
}
unittest
{
    __m128d A = _mm_setr_pd(double.infinity, double.nan);
    __m128d B = _mm_setr_pd(3.0,             4.0);
    long2 R = cast(long2) _mm_cmp_pd!_CMP_GT_OS(A, B);
    long[2] correct = [-1, 0];
    assert(R.array == correct);

    long2 R2 = cast(long2) _mm_cmp_pd!_CMP_NLE_UQ(A, B);
    long[2] correct2 = [-1, -1];
    assert(R2.array == correct2);
}

///ditto
__m256d _mm256_cmp_pd(int imm8)(__m256d a, __m256d b) pure @safe
{
    // Translate the AVX predicate immediate at compile time, then dispatch to the helper.
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m256d) cmppd256!comparison(a, b);
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, double.nan);
    __m256d B = _mm256_setr_pd(3.0, 2.0, 1.0, double.nan);
    __m256i R = cast(__m256i) _mm256_cmp_pd!_CMP_LT_OS(A, B);
    long[4] correct = [-1, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` based on the
/// comparison operand specified by `imm8`.
__m128 _mm_cmp_ps(int imm8)(__m128 a, __m128 b) pure @safe
{
    // Translate the AVX predicate immediate at compile time, then dispatch to the helper.
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128) cmpps!comparison(a, b);
}
unittest
{
    // Instantiates the template so that type errors cannot hide in it.
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    int4 R = cast(int4) _mm_cmp_ps!_CMP_LT_OS(A, B);
    int[4] correct = [-1, 0, 0, 0];
    assert(R.array == correct);
}

///ditto
__m256 _mm256_cmp_ps(int imm8)(__m256 a, __m256 b) pure @safe
{
    // Translate the AVX predicate immediate at compile time, then dispatch to the helper.
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m256) cmpps256!comparison(a, b);
}
unittest
{
    // Instantiates the template so that type errors cannot hide in it.
    __m256 A = _mm256_setr_ps(1.0f, 2.0f, 3.0f, float.nan, 1.0f, 2.0f, 3.0f, float.nan);
    __m256 B = _mm256_setr_ps(3.0f, 2.0f, 1.0f, float.nan, 3.0f, 2.0f, 1.0f, float.nan);
    int8 R = cast(int8) _mm256_cmp_ps!_CMP_LT_OS(A, B);
    int[8] correct = [-1, 0, 0, 0, -1, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare the lower double-precision (64-bit) floating-point element in `a` and `b` based on the
/// comparison operand specified by `imm8`, store the result in the lower element of result, and
/// copy the upper element from `a` to the upper element of result.
__m128d _mm_cmp_sd(int imm8)(__m128d a, __m128d b) pure @safe
{
    // Translate the AVX predicate immediate at compile time, then dispatch to the helper.
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128d) cmpsd!comparison(a, b);
}
unittest
{
    // Instantiates the template so that type errors cannot hide in it.
    __m128d A = _mm_setr_pd(1.0, 42.0);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    __m128d R = _mm_cmp_sd!_CMP_LT_OS(A, B);
    assert((cast(long2)R).array[0] == -1); // 1.0 < 3.0 => all bits set
    assert(R.array[1] == 42.0);            // upper element copied from `a`
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` based on the
/// comparison operand specified by `imm8`, store the result in the lower element of result, and
/// copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmp_ss(int imm8)(__m128 a, __m128 b) pure @safe
{
    // Translate the AVX predicate immediate at compile time, then dispatch to the helper.
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128) cmpss!comparison(a, b);
}
unittest
{
    // Instantiates the template so that type errors cannot hide in it.
    __m128 A = _mm_setr_ps(1.0f, 42.0f, 43.0f, 44.0f);
    __m128 B = _mm_setr_ps(3.0f, 4.0f, 5.0f, 6.0f);
    __m128 R = _mm_cmp_ss!_CMP_LT_OS(A, B);
    assert((cast(int4)R).array[0] == -1); // 1.0f < 3.0f => all bits set
    assert(R.array[1] == 42.0f);          // upper elements copied from `a`
    assert(R.array[2] == 43.0f);
    assert(R.array[3] == 44.0f);
}

/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point
/// elements.
__m256d _mm256_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Express the conversion directly as LLVM IR.
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x double>
            ret <4 x double> %r`;
        return LDCInlineIR!(ir, double4, __m128i)(a);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_cvtdq2pd256(a);
    }
    else
    {
        // Lane-wise implicit int -> double conversion.
        double4 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m256d R = _mm256_cvtepi32_pd(_mm_set1_epi32(54));
    double[4] correct = [54.0, 54, 54, 54];
    assert(R.array == correct);
}

/// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point
/// elements.
__m256 _mm256_cvtepi32_ps (__m256i a) pure @trusted
{
    version(LDC)
    {
        // Express the conversion directly as LLVM IR.
        enum ir = `
            %r = sitofp <8 x i32> %0 to <8 x float>
            ret <8 x float> %r`;
        return LDCInlineIR!(ir, float8, int8)(cast(int8)a);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_cvtdq2ps256(cast(int8)a);
    }
    else
    {
        // Lane-wise implicit int -> float conversion.
        int8 ints = cast(int8)a;
        __m256 floats;
        foreach (lane; 0 .. 8)
            floats.ptr[lane] = ints.array[lane];
        return floats;
    }
}
unittest
{
    __m256 R = _mm256_cvtepi32_ps(_mm256_set1_epi32(5));
    float[8] correct = [5.0f, 5, 5, 5, 5, 5, 5, 5];
    assert(R.array == correct);
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit
/// integers. Follows the current rounding mode.
__m128i _mm256_cvtpd_epi32 (__m256d a) @safe
{
    static if (!GDC_or_LDC_with_AVX)
    {
        // Convert each 128-bit half separately. Each conversion only yields
        // significant values in its lower 64 bits, so the two low halves are
        // then packed side by side.
        __m128d lowerHalf = _mm256_extractf128_pd!0(a);
        __m128d upperHalf = _mm256_extractf128_pd!1(a);
        __m128i lowerInts = _mm_cvtpd_epi32(lowerHalf);
        __m128i upperInts = _mm_cvtpd_epi32(upperHalf);
        return _mm_unpacklo_epi64(lowerInts, upperInts);
    }
    else
    {
        return __builtin_ia32_cvtpd2dq256(a);
    }
}
unittest
{
    int4 A = _mm256_cvtpd_epi32(_mm256_setr_pd(61.0, 55.0, -100, 1_000_000));
    int[4] correct = [61, 55, -100, 1_000_000];
    assert(A.array == correct);
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed single-precision (32-bit)
/// floating-point elements.
__m128 _mm256_cvtpd_ps (__m256d a) pure @trusted
{
    // PERF DMD
    static if (!GDC_or_LDC_with_AVX)
    {
        // Lane-wise implicit double -> float conversion.
        __m128 narrowed;
        foreach (lane; 0 .. 4)
            narrowed.ptr[lane] = a.array[lane];
        return narrowed;
    }
    else
    {
        return __builtin_ia32_cvtpd2ps256(a);
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 5);
    __m128 R = _mm256_cvtpd_ps(A);
    float[4] correct = [1.0f, 2, 3, 5];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit
/// integers, using the current rounding mode.
__m256i _mm256_cvtps_epi32 (__m256 a) @trusted
{
    static if (!GDC_or_LDC_with_AVX)
    {
        // Convert each 128-bit half with the 128-bit intrinsic, then reassemble.
        __m128 lowerHalf  = _mm256_extractf128_ps!0(a);
        __m128 upperHalf  = _mm256_extractf128_ps!1(a);
        __m128i lowerInts = _mm_cvtps_epi32(lowerHalf);
        __m128i upperInts = _mm_cvtps_epi32(upperHalf);
        return _mm256_set_m128i(upperInts, lowerInts);
    }
    else
    {
        return cast(__m256i) __builtin_ia32_cvtps2dq256(a);
    }
}
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m256i A = _mm256_cvtps_epi32(_mm256_setr_ps(1.4f, -2.1f, 53.5f, -2.9f, -1.4f, 2.1f, -53.5f, 2.9f));
    assert( (cast(int8)A).array == [1, -2, 54, -3, -1, 2, -54, 3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm256_cvtps_epi32(_mm256_setr_ps(1.3f, -2.11f, 53.4f, -2.8f, -1.3f, 2.11f, -53.4f, 2.8f));
    assert( (cast(int8)A).array == [1, -3, 53, -3, -2, 2, -54, 2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm256_cvtps_epi32(_mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f));
    assert( (cast(int8)A).array == [2, -2, 54, -2, -1, 3, -53, 3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm256_cvtps_epi32(_mm256_setr_ps(1.4f, -2.17f, 53.8f, -2.91f, -1.4f, 2.17f, -53.8f, 2.91f));
    assert( (cast(int8)A).array == [1, -2, 53, -2, -1, 2, -53, 2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}


/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed double-precision
/// (64-bit) floating-point elements.
__m256d _mm256_cvtps_pd (__m128 a) pure @trusted
{
    // PERF DMD
    static if (!GDC_with_AVX)
    {
        // LDC: x86, needs -O2 to generate cvtps2pd since LDC 1.2.0
        // Lane-wise implicit float -> double conversion.
        __m256d widened;
        foreach (lane; 0 .. 4)
            widened.ptr[lane] = a.array[lane];
        return widened;
    }
    else
    {
        return __builtin_ia32_cvtps2pd256(a); // LDC doesn't have the builtin
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 5);
    __m256d R = _mm256_cvtps_pd(A);
    double[4] correct = [1.0, 2, 3, 5];
    assert(R.array == correct);
}

/// Return the lower double-precision (64-bit) floating-point element of `a`.
double _mm256_cvtsd_f64 (__m256d a) pure @safe
{
    const double lowest = a.array[0];
    return lowest;
}

/// Return the lower 32-bit integer in `a`.
int _mm256_cvtsi256_si32 (__m256i a) pure @safe
{
    // View the vector as 8 x int and pick lane 0.
    int8 asInts = cast(int8)a;
    return asInts.array[0];
}

/// Return the lower single-precision (32-bit) floating-point element of `a`.
float _mm256_cvtss_f32 (__m256 a) pure @safe
{
    const float lowest = a.array[0];
    return lowest;
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit
/// integers with truncation.
__m128i _mm256_cvttpd_epi32 (__m256d a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m128i)__builtin_ia32_cvttpd2dq256(a);
    }
    else
    {
        // Portable path: convert lane by lane with `cast(int)`.
        __m128i truncated;
        foreach (lane; 0 .. 4)
            truncated.ptr[lane] = cast(int) a.array[lane];
        return truncated;
    }
}
unittest
{
    // `_mm256_set_pd` takes its arguments from the highest element to the lowest.
    __m256d src = _mm256_set_pd(4.7, -1000.9, -7.1, 3.1);
    int[4] expected = [3, -7, -1000, 4];
    __m128i got = _mm256_cvttpd_epi32(src);
    assert(got.array == expected);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit
/// integers with truncation.
__m256i _mm256_cvttps_epi32 (__m256 a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256i)__builtin_ia32_cvttps2dq256(a);
    }
    else
    {
        // Portable path: convert lane by lane with `cast(int)`.
        int8 truncated;
        foreach (lane; 0 .. 8)
            truncated.ptr[lane] = cast(int) a.array[lane];
        return cast(__m256i)truncated;
    }
}
unittest
{
    __m256 src = _mm256_set_ps(4.7, -1000.9, -7.1, 3.1, 1.4, 2.9, -2.9, 0);
    int[8] expected = [0, -2, 2, 1, 3, -7, -1000, 4];
    int8 got = cast(int8) _mm256_cvttps_epi32(src);
    assert(got.array == expected);
}

/// Divide each packed double-precision (64-bit) floating-point element of `a` by the
/// corresponding element of `b`.
__m256d _mm256_div_pd (__m256d a, __m256d b) pure @safe
{
    return a / b;
}
unittest
{
    // Dividing a vector by itself must give 1.0 in every lane.
    __m256d v = [1.5, -2.0, 3.0, 1.0];
    double[4] ones = [1.0, 1.0, 1.0, 1.0];
    v = _mm256_div_pd(v, v);
    assert(v.array == ones);
}

/// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
__m256 _mm256_div_ps (__m256 a, __m256 b) pure @safe
{
    return a / b;
}
unittest
{
    // Dividing a vector by itself must give 1.0 in every lane.
    __m256 v = [1.5f, -2.0f, 3.0f, 1.0f, 4.5f, -5.0f, 6.0f, 7.0f];
    float[8] ones = [1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f];
    v = _mm256_div_ps(v, v);
    assert(v.array == ones);
}

/// Conditionally multiply the packed single-precision (32-bit) floating-point elements in `a` and
/// `b` using the high 4 bits in `imm8`, sum the four products, and conditionally store the sum
/// using the low 4 bits of `imm8`. Each 128-bit half is processed independently with the same
/// `imm8`.
__m256 _mm256_dp_ps(int imm8)(__m256 a, __m256 b)
{
    // PERF DMD
    // PERF without AVX, can use 2 _mm_dp_ps exactly (beware the imm8 is tricky)
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_dpps256(a, b, cast(ubyte)imm8);
    }
    else
    {
        // High nibble: which products enter the sum. Low nibble: which result lanes get it.
        // Each nibble is duplicated so the 8-bit blend mask covers both 128-bit halves.
        enum ubyte productSel = (imm8 >>> 4) & 15;
        enum ubyte storeSel   = (imm8 & 15);
        __m256 zeroes = _mm256_setzero_ps();

        __m256 kept = _mm256_blend_ps!( productSel | (productSel << 4) )(zeroes, a * b);
        float sumLow  = kept.array[0] + kept.array[1] + kept.array[2] + kept.array[3];
        float sumHigh = kept.array[4] + kept.array[5] + kept.array[6] + kept.array[7];

        __m256 broadcast = _mm256_set_m128(_mm_set1_ps(sumHigh), _mm_set1_ps(sumLow));
        return _mm256_blend_ps!( storeSel | (storeSel << 4) )(zeroes, broadcast);
    }
}
unittest
{
    // Products:                 9    14    20   24     6    16    12   -24
    __m256 A = _mm256_setr_ps(1.0f, 2.0f, 4.0f, 8.0f, 1.0f, 2.0f, 4.0f, 8.0f);
    __m256 B = _mm256_setr_ps(9.0f, 7.0f, 5.0f, 3.0f, 6.0f, 8.0f, 3.0f,-3.0f);
    float[8] expected1 = [67.0f, 67.0f, 67.0f,67.0f,  10,   10,   10,  10];
    float[8] expected2 = [23.0f, 0.0f, 23.0f, 0.0f,   22,    0,   22,   0];
    float[8] expected3 = [0.0f, 29.0f, 0.0f, 29.0f,    0,   18,    0,  18];
    float8 R1 = _mm256_dp_ps!(0xf0 + 0xf)(A, B);
    float8 R2 = _mm256_dp_ps!(0x30 + 0x5)(A, B);
    float8 R3 = _mm256_dp_ps!(0x50 + 0xa)(A, B);
    assert(R1.array == expected1);
    assert(R2.array == expected2);
    assert(R3.array == expected3);
}

/// Extract the 32-bit integer of `a` at position `imm8 & 7`.
int _mm256_extract_epi32 (__m256i a, const int imm8) pure @trusted
{
    int8 asInts = cast(int8)a;
    return asInts.array[imm8 & 7];
}
unittest
{
    align(16) int[8] data = [-1, 2, -3, 4, 9, -7, 8, -6];
    auto A = _mm256_loadu_si256(cast(__m256i*) data.ptr);
    // Bits of the index above the low three are ignored.
    assert(_mm256_extract_epi32(A, 0) == -1);
    assert(_mm256_extract_epi32(A, 1 + 8) == 2);
    assert(_mm256_extract_epi32(A, 3 + 16) == 4);
    assert(_mm256_extract_epi32(A, 7 + 32) == -6);
}

/// Extract the 64-bit integer of `a` at position `index & 3`.
long _mm256_extract_epi64 (__m256i a, const int index) pure @safe
{
    return a.array[index & 3];
}
unittest
{
    __m256i A = _mm256_setr_epi64x(-7, 6, 42, 0);
    // Bits of the index above the low two are ignored.
    assert(_mm256_extract_epi64(A, -8) == -7);
    assert(_mm256_extract_epi64(A, 1) == 6);
    assert(_mm256_extract_epi64(A, 2 + 4) == 42);
}

/// Extract a 128-bits lane from `a`, selected with `imm8` (only bit 0 is used: 0 = low lane,
/// 1 = high lane).
/// Note: `_mm256_extractf128_pd!0` is equivalent to `_mm256_castpd256_pd128`.
__m128d _mm256_extractf128_pd(ubyte imm8)(__m256d a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_AVX)
    {
        // Note: needs to be a template intrinsics because of this builtin.
        return __builtin_ia32_vextractf128_pd256(a, imm8 & 1);
    }
    else
    {
        enum int first = 2*(imm8 & 1);
        double2 lane = void;
        lane.ptr[0] = a.array[first+0];
        lane.ptr[1] = a.array[first+1];
        return lane;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    double[4] expected = [1.0, 2, 3, 4];
    __m128d low  = _mm256_extractf128_pd!18(A);
    __m128d high = _mm256_extractf128_pd!55(A);
    assert(low.array == expected[0..2]);
    assert(high.array == expected[2..4]);
}

///ditto
__m128 _mm256_extractf128_ps(ubyte imm8)(__m256 a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vextractf128_ps256(a, imm8 & 1);
    }
    else
    {
        enum int first = 4*(imm8 & 1);
        float4 lane = void; // Optimize well since LDC 1.1 -O1
        lane.ptr[0] = a.array[first+0];
        lane.ptr[1] = a.array[first+1];
        lane.ptr[2] = a.array[first+2];
        lane.ptr[3] = a.array[first+3];
        return lane;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0, 2, 3, 4, 5, 6, 7, 8);
    float[8] expected = [1.0, 2, 3, 4, 5, 6, 7, 8];
    __m128 low  = _mm256_extractf128_ps!8(A);
    __m128 high = _mm256_extractf128_ps!255(A);
    assert(low.array == expected[0..4]);
    assert(high.array == expected[4..8]);
}

///ditto
__m128i _mm256_extractf128_si256(ubyte imm8)(__m256i a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_AVX)
    {
        // Note: if it weren't for this GDC intrinsic, _mm256_extractf128_si256
        // could be a non-template, however, this wins in -O0.
        // Same story for _mm256_extractf128_ps and _mm256_extractf128_pd
        return __builtin_ia32_vextractf128_si256(cast(int8)a, imm8 & 1);
    }
    else
    {
        enum int first = 2*(imm8 & 1);
        long2 lane = void;
        lane.ptr[0] = a.array[first+0];
        lane.ptr[1] = a.array[first+1];
        return cast(__m128i)lane;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(9, 2, 3, 4, 5, 6, 7, 8);
    int[8] expected = [9, 2, 3, 4, 5, 6, 7, 8];
    __m128i low  = _mm256_extractf128_si256!0(A);
    __m128i high = _mm256_extractf128_si256!1(A);
    assert(low.array == expected[0..4]);
    assert(high.array == expected[4..8]);
}

/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an integer
/// value, and store the results as packed double-precision floating-point elements.
__m256d _mm256_floor_pd (__m256d a) @safe
{
    static if (LDC_with_ARM64)
    {
        // Floor each 128-bit half separately, then reassemble (high, low).
        __m128d flooredLow  = _mm_floor_pd(_mm256_extractf128_pd!0(a));
        __m128d flooredHigh = _mm_floor_pd(_mm256_extractf128_pd!1(a));
        return _mm256_set_m128d(flooredHigh, flooredLow);
    }
    else
    {
        return _mm256_round_pd!1(a);
    }
}
unittest
{
    __m256d v = _mm256_setr_pd(1.3f, -2.12f, 53.6f, -2.7f);
    double[4] expected = [1.0, -3.0, 53.0, -3.0];
    v = _mm256_floor_pd(v);
    assert(v.array == expected);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an integer
/// value, and store the results as packed single-precision floating-point elements.
__m256 _mm256_floor_ps (__m256 a) @safe
{
    static if (LDC_with_ARM64)
    {
        // Floor each 128-bit half separately, then reassemble (high, low).
        __m128 flooredLow  = _mm_floor_ps(_mm256_extractf128_ps!0(a));
        __m128 flooredHigh = _mm_floor_ps(_mm256_extractf128_ps!1(a));
        return _mm256_set_m128(flooredHigh, flooredLow);
    }
    else
    {
        return _mm256_round_ps!1(a);
    }
}
unittest
{
    __m256 src = _mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f);
    float[8] expected = [1.0f, -3.0f, 53.0f, -3.0f, -2, 2, -54, 2];
    __m256 got = _mm256_floor_ps(src);
    assert(got.array == expected);
}

/// Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in `a`
/// and `b`. Within each 128-bit half, the pair sum from `a` comes first, then the one from `b`.
__m256d _mm256_hadd_pd (__m256d a, __m256d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_haddpd256(a, b);
    }
    else
    {
        __m256d sums;
        // Low 128-bit half.
        sums.ptr[0] = a.array[1] + a.array[0];
        sums.ptr[1] = b.array[1] + b.array[0];
        // High 128-bit half.
        sums.ptr[2] = a.array[3] + a.array[2];
        sums.ptr[3] = b.array[3] + b.array[2];
        return sums;
    }
}
unittest
{
    __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
    __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
    double[4] expected = [3.5, 8.0, 30.0, 114.0];
    __m256d got = _mm256_hadd_pd(A, B);
    assert(got.array == expected);
}

/// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in `a` and
/// `b`.
__m256 _mm256_hadd_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_haddps256(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // Pairwise-add each 128-bit half separately, then reassemble (high, low).
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 b_hi = _mm256_extractf128_ps!1(b);
        __m128 b_lo = _mm256_extractf128_ps!0(b);
        __m128 hi = vpaddq_f32(a_hi, b_hi);
        __m128 lo = vpaddq_f32(a_lo, b_lo);
        return _mm256_set_m128(hi, lo);
    }
    else
    {
        // Within each 128-bit half: the two pair sums of `a`, then the two pair sums of `b`.
        __m256 res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        res.ptr[4] = a.array[5] + a.array[4];
        res.ptr[5] = a.array[7] + a.array[6];
        res.ptr[6] = b.array[5] + b.array[4];
        res.ptr[7] = b.array[7] + b.array[6];
        return res;
    }
}
unittest
{
    __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
    __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
    __m256 R = _mm256_hadd_ps(A, B);
    float[8] correct =       [3.0f, 8.0f, 3.5f, 7.5f, 3.0f, 8.0f, 3.5f, 8.5f];
    assert(R.array == correct);
}

/// Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in
/// `a` and `b`. In each pair, the odd-indexed element is subtracted from the even-indexed one.
__m256d _mm256_hsub_pd (__m256d a, __m256d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_hsubpd256(a, b);
    }
    else
    {
        // 2 zip1, 2 zip2, 2 fsub... I don't think there is better in arm64
        __m256d res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = b.array[0] - b.array[1];
        res.ptr[2] = a.array[2] - a.array[3];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
    __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
    __m256d C = _mm256_hsub_pd(A, B);
    double[4] correct = [-0.5, -6.0, 12.0, 86.0];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in
/// `a` and `b`. Each 128-bit half is processed independently.
__m256 _mm256_hsub_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_hsubps256(a, b);
    }
    else
    {
        // Delegate each 128-bit half to the 128-bit intrinsic, then reassemble (high, low).
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 b_hi = _mm256_extractf128_ps!1(b);
        __m128 b_lo = _mm256_extractf128_ps!0(b);
        __m128 hi = _mm_hsub_ps(a_hi, b_hi);
        __m128 lo = _mm_hsub_ps(a_lo, b_lo);
        return _mm256_set_m128(hi, lo);
    }
}
unittest
{
    __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
    __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
    __m256 R = _mm256_hsub_ps(A, B);
    float[8] correct =   [-1.0f, -2.0f, -0.5f, -0.5f, -1.0f, -2.0f, -0.5f, -1.5f];
    assert(R.array == correct);
}

/// Copy `a`, and insert the 16-bit integer `i` into the result at the location specified by
/// `index & 15`.
__m256i _mm256_insert_epi16 (__m256i a, short i, const int index) pure @trusted
{
    // Work on a 16 x short view of `a`; only the selected element is overwritten.
    short16 lanes = cast(short16)a;
    lanes.ptr[index & 15] = i;
    return cast(__m256i)lanes;
}
unittest
{
    __m256i ones = _mm256_set1_epi16(1);
    short[16] expected = [1, 1, 1, 1, 1, 1, 1, 2,
                          1, 1, 1, 1, 1, 1, 1, 1 ];
    // 16 + 16 + 7 selects element 7 once masked with 15.
    short16 got = cast(short16) _mm256_insert_epi16(ones, 2, 16 + 16 + 7);
    assert(got.array == expected);
}

/// Copy `a`, and insert the 32-bit integer `i` into the result at the location specified by
/// `index & 7`.
__m256i _mm256_insert_epi32 (__m256i a, int i, const int index) pure @trusted
{
    // Work on an 8 x int view of `a`; only the selected element is overwritten.
    int8 lanes = cast(int8)a;
    lanes.ptr[index & 7] = i;
    return cast(__m256i)lanes;
}
unittest
{
    __m256i ones = _mm256_set1_epi32(1);
    int[8] expected = [1, -2, 1, 1, 1, 1, 1, 1];
    // 8 + 8 + 1 selects element 1 once masked with 7.
    int8 got = cast(int8) _mm256_insert_epi32(ones, -2, 8 + 8 + 1);
    assert(got.array == expected);
}

/// Copy `a`, and insert the 64-bit integer `i` into the result at the location specified by
/// `index & 3`.
__m256i _mm256_insert_epi64(__m256i a, long i, const int index) pure @trusted
{
    __m256i result = a;
    result.ptr[index & 3] = i;
    return result;
}
unittest
{
    __m256i ones = _mm256_set1_epi64(1);
    long[4] expected = [1, 1, -2, 1];
    // 2 - 4 - 4 is negative, yet still selects element 2 once masked with 3.
    long4 got = cast(long4) _mm256_insert_epi64(ones, -2, 2 - 4 - 4);
    assert(got.array == expected);
}

/// Copy `a`, and insert the 8-bit integer `i` into the result at the location specified by
/// `index & 31`.
__m256i _mm256_insert_epi8(__m256i a, byte i, const int index) pure @trusted
{
    // Work on a 32 x byte view of `a`; only the selected element is overwritten.
    byte32 ba = cast(byte32)a;
    ba.ptr[index & 31] = i;
    return cast(__m256i)ba;
}
unittest
{
    __m256i A = _mm256_set1_epi8(1);
    // 7 - 32 - 32 is negative, yet still selects element 7 once masked with 31.
    byte32 R = cast(byte32) _mm256_insert_epi8(A, -2, 7 - 32 - 32);
    byte[32] correct = [1, 1, 1, 1, 1, 1, 1,-2, 1, 1, 1, 1, 1, 1, 1, 1,
                        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ];
    assert(R.array == correct);
}

/// Copy `a`, then insert 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `b` at the location specified by `imm8`.
/// Only bit 0 of `imm8` is used: 0 replaces the low 128 bits, 1 replaces the high 128 bits.
__m256d _mm256_insertf128_pd(int imm8)(__m256d a, __m128d b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return __builtin_ia32_vinsertf128_pd256(a, b, lane);
    }
    else
    {
        __m256d r = a;
        enum int index = (imm8 & 1) ? 2 : 0; // first element of the destination half
        r.ptr[index] = b.array[0];
        r.ptr[index+1] = b.array[1];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    __m128d B = _mm_setr_pd(5.0, 6);
    __m256d R0 = _mm256_insertf128_pd!0(A, B);
    __m256d R1 = _mm256_insertf128_pd!1(A, B);
    double[4] correct0 = [5.0, 6, 3, 4];
    double[4] correct1 = [1.0, 2, 5, 6];
    assert(R0.array == correct0);
    assert(R1.array == correct1);
}

/// Copy `a` then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point
/// elements) from `b`, at the location specified by `imm8`.
/// Only bit 0 of `imm8` is used: 0 replaces the low 128 bits, 1 replaces the high 128 bits.
__m256 _mm256_insertf128_ps(int imm8)(__m256 a, __m128 b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return __builtin_ia32_vinsertf128_ps256(a, b, lane);
    }
    else
    {
        __m256 r = a;
        enum int index = (imm8 & 1) ? 4 : 0; // first element of the destination half
        r.ptr[index] = b.array[0];
        r.ptr[index+1] = b.array[1];
        r.ptr[index+2] = b.array[2];
        r.ptr[index+3] = b.array[3];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    __m128 B = _mm_setr_ps(9.0f, 10, 11, 12);
    __m256 R0 = _mm256_insertf128_ps!0(A, B);
    __m256 R1 = _mm256_insertf128_ps!1(A, B);
    float[8] correct0 = [9.0f, 10, 11, 12, 5, 6, 7, 8];
    float[8] correct1 = [1.0f, 2, 3, 4, 9, 10, 11, 12];
    assert(R0.array == correct0);
    assert(R1.array == correct1);
}

/// Copy `a`, then insert 128 bits from `b` at the location specified by `imm8`.
__m256i _mm256_insertf128_si256(int imm8)(__m256i a, __m128i b) pure @trusted
{
    // Only bit 0 of `imm8` is used: 0 replaces the low 128 bits, 1 replaces the high 128 bits.
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return cast(__m256i) __builtin_ia32_vinsertf128_si256 (cast(int8)a, b, lane);
    }
    else
    {
        long2 lb = cast(long2)b;
        __m256i r = a;
        enum int index = (imm8 & 1) ? 2 : 0; // first 64-bit element of the destination half
        r.ptr[index] = lb.array[0];
        r.ptr[index+1] = lb.array[1];
        return r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
    __m128i B = _mm_setr_epi32(9, 10, 11, 12);
    int8 R0 = cast(int8) _mm256_insertf128_si256!0(A, B);
    int8 R1 = cast(int8) _mm256_insertf128_si256!1(A, B);
    int[8] correct0 = [9, 10, 11, 12, 5, 6, 7, 8];
    int[8] correct1 = [1, 2, 3, 4, 9, 10, 11, 12];
    assert(R0.array == correct0);
    assert(R1.array == correct1);
}

/// Load 256-bits of integer data from unaligned memory into dst.
/// This intrinsic may perform better than `_mm256_loadu_si256` when the data crosses a cache
/// line boundary.
__m256i _mm256_lddqu_si256(const(__m256i)* mem_addr) @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_lddqu256(cast(const(char)*)mem_addr);
    }
    else
        return _mm256_loadu_si256(mem_addr); // without the builtin, a plain unaligned load
}
unittest
{
    int[10] correct = [0, -1, 2, -3, 4, 9, -7, 8, -6, 34];
    // Start at element 1 so the load is not at the start of the array.
    int8 A = cast(int8) _mm256_lddqu_si256(cast(__m256i*) &correct[1]);
    assert(A.array == correct[1..9]);
}

/// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements)
/// from memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
/// exception may be generated.
__m256d _mm256_load_pd (const(double)* mem_addr) pure @trusted
{
    return *cast(__m256d*)mem_addr;
}
unittest
{
    static immutable align(32) double[4] correct = [1.0, 2.0, 3.5, -42.0];
    __m256d A = _mm256_load_pd(correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
__m256 _mm256_load_ps (const(float)* mem_addr) pure @trusted
{
    return *cast(__m256*)mem_addr;
}
unittest
{
    static immutable align(32) float[8] source =
        [1.0, 2.0, 3.5, -42.0, 7.43f, 0.0f, 3, 2];
    __m256 loaded = _mm256_load_ps(source.ptr);
    assert(loaded.array == source);
}

/// Load 256-bits of integer data from memory. `mem_addr` does not need to be aligned on
/// any particular boundary.
// See this dlang forum post => https://forum.dlang.org/thread/vymrsngsfibkmqsqffce@forum.dlang.org
__m256i _mm256_loadu_si256 (const(__m256i)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m256i)(cast(long*)mem_addr);
    }
    else
    {
        // Portable path: read the four 64-bit elements one by one.
        const(long)* src = cast(const(long)*)mem_addr;
        long4 loaded;
        foreach (lane; 0 .. 4)
            loaded.ptr[lane] = src[lane];
        return loaded;
    }
}
unittest
{
    align(16) int[8] source = [-1, 2, -3, 4, 9, -7, 8, -6];
    int8 loaded = cast(int8) _mm256_loadu_si256(cast(__m256i*) source.ptr);
    assert(loaded.array == source);
}

/// Load 256-bits of integer data from memory. `mem_addr` must be aligned on a
/// 32-byte boundary or a general-protection exception may be generated.
__m256i _mm256_load_si256 (const(void)* mem_addr) pure @system
{
    return *cast(__m256i*)mem_addr;
}
unittest
{
    static immutable align(64) long[4] source = [1, -2, long.min, long.max];
    __m256i loaded = _mm256_load_si256(source.ptr);
    assert(loaded.array == source);
}

/// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements)
/// from memory. `mem_addr` does not need to be aligned on any particular boundary.
__m256d _mm256_loadu_pd (const(void)* mem_addr) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_loadupd256 ( cast(const(double)*) mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m256d)(cast(double*)mem_addr);
    }
    else
    {
        // Portable path: read the four doubles one by one.
        const(double)* src = cast(const(double)*)mem_addr;
        double4 loaded;
        foreach (lane; 0 .. 4)
            loaded.ptr[lane] = src[lane];
        return loaded;
    }
}
unittest
{
    double[4] source = [1.0, -2.0, 0.0, 768.5];
    __m256d loaded = _mm256_loadu_pd(source.ptr);
    assert(loaded.array == source);
}

/// Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
/// memory. `mem_addr` does not need to be aligned on any particular boundary.
__m256 _mm256_loadu_ps (const(float)* mem_addr) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_loadups256 ( cast(const(float)*) mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m256)(cast(float*)mem_addr);
    }
    else
    {
        // Portable path: read the eight floats one by one (every lane is written, so the
        // vector can start uninitialized).
        const(float)* src = cast(const(float)*)mem_addr;
        float8 loaded = void;
        foreach (lane; 0 .. 8)
            loaded.ptr[lane] = src[lane];
        return loaded;
    }
}
unittest
{
    align(32) float[10] source = [0.0f, 1, 2, 3, 4, 5, 6, 7, 8, 9];
    // Start at element 1 so the address is deliberately not 32-byte aligned.
    __m256 loaded = _mm256_loadu_ps(&source[1]);
    assert(loaded.array == source[1..9]);
}

/// Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point
/// elements) from memory, and combine them into a 256-bit value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256 _mm256_loadu2_m128 (const(float)* hiaddr, const(float)* loaddr) pure @system
{
    // Note: no particular instruction for this in x86.
    __m128 high = _mm_loadu_ps(hiaddr);
    __m128 low  = _mm_loadu_ps(loaddr);
    return _mm256_set_m128(high, low);
}
unittest
{
    align(32) float[6] A = [4.5f, 2, 8, 97, -1, 3];
    align(32) float[6] B = [6.5f, 3, 9, 98, -2, 4];
    float[8] expected = [2.0f, 8, 97, -1, 3, 9, 98, -2];
    __m256 got = _mm256_loadu2_m128(&B[1], &A[1]);
    assert(got.array == expected);
}

/// Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point
/// elements) from memory, and combine them into a 256-bit value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256d _mm256_loadu2_m128d (const(double)* hiaddr, const(double)* loaddr) pure @system
{
    // Note: no particular instruction for this in x86.
    __m128d high = _mm_loadu_pd(hiaddr);
    __m128d low  = _mm_loadu_pd(loaddr);
    return _mm256_set_m128d(high, low);
}
unittest
{
    align(32) double[4] A = [4.5f, 2, 8, 97];
    align(32) double[4] B = [6.5f, 3, 9, 98];
    double[4] expected = [2.0, 8, 3, 9];
    __m256d got = _mm256_loadu2_m128d(&B[1], &A[1]);
    assert(got.array == expected);
}

/// Load two 128-bit values (composed of integer data) from memory, and combine them into a
/// 256-bit value. `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256i _mm256_loadu2_m128i (const(__m128i)* hiaddr, const(__m128i)* loaddr) pure @trusted
{
    // Note: no particular instruction for this in x86.
    __m128i high = _mm_loadu_si128(hiaddr);
    __m128i low  = _mm_loadu_si128(loaddr);
    return _mm256_set_m128i(high, low);
}
unittest
{
    align(32) long[4] A = [5, 2, 8, 97];
    align(32) long[4] B = [6, 3, 9, 98];
    long[4] expected = [2, 8, 3, 9];
    __m256i got = _mm256_loadu2_m128i(cast(const(__m128i)*) &B[1], cast(const(__m128i)*) &A[1]);
    assert(got.array == expected);
}

version(DigitalMars)
{
    // this avoids a bug with DMD < 2.099 -a x86 -O
    private enum bool maskLoadWorkaround = (__VERSION__ < 2099);
}
else
{
    private enum bool maskLoadWorkaround = false;
}

/// Load packed double-precision (64-bit) floating-point elements from memory using `mask`
/// (elements are zeroed out when the high bit of the corresponding element is not set).
/// Note: emulating that instruction isn't efficient, since it needs to perform memory access
/// only when needed.
/// See: "Note about mask load/store" to know why you must address valid memory only.
__m128d _mm_maskload_pd (const(double)* mem_addr, __m128i mask) /* pure */ @system
{
    // PERF DMD
    // PERF ARM64
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadpd(mem_addr, cast(long2)mask);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_maskloadpd(cast(double2*)mem_addr, cast(long2)mask);
    }
    else
    {
        // A negative 64-bit mask element has its high bit set: only then is memory read.
        long2 selector = cast(long2)mask;
        double2 loaded;
        foreach (lane; 0 .. 2)
            loaded.ptr[lane] = (selector.array[lane] < 0) ? mem_addr[lane] : 0.0;
        return loaded;
    }
}
unittest
{
    static if (!maskLoadWorkaround)
    {
        double[2] source = [7.5, 1];
        double[2] expected = [7.5, 0];
        double2 got = _mm_maskload_pd(source.ptr, _mm_setr_epi64(-1, 1));
        assert(got.array == expected);
    }
}

/// Load packed double-precision (64-bit) floating-point elements from memory using `mask`
/// (elements are zeroed out when the high bit of the corresponding element is not set).
/// See: "Note about mask load/store" to know why you must address valid memory only.
__m256d _mm256_maskload_pd (const(double)* mem_addr, __m256i mask) /*pure*/ @system
{
    // PERF DMD
    // PERF ARM64
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadpd256(mem_addr, mask);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_maskloadpd256(cast(double4*)mem_addr, mask);
    }
    else
    {
        // A negative 64-bit mask element has its high bit set: only then is memory read.
        long4 selector = cast(long4)mask;
        double4 loaded;
        foreach (lane; 0 .. 4)
            loaded.ptr[lane] = (selector.array[lane] < 0) ? mem_addr[lane] : 0.0;
        return loaded;
    }
}
unittest
{
    static if (!maskLoadWorkaround)
    {
        double[4] source = [7.5, 1, 2, 3];
        double[4] expected = [0.0, 1, 2, 0];
        double4 got = _mm256_maskload_pd(source.ptr, _mm256_setr_epi64(1, -1, -1, 1));
        assert(got.array == expected);
    }
}

/// Load packed single-precision (32-bit) floating-point elements from memory using mask (elements
/// are zeroed out when the high bit of the corresponding element is not set).
/// Note: emulating that instruction isn't efficient, since it needs to perform memory access
/// only when needed.
/// See: "Note about mask load/store" to know why you must address valid memory only.
__m128 _mm_maskload_ps (const(float)* mem_addr, __m128i mask) /* pure */ @system
{
    // PERF DMD
    // PERF ARM64
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadps(mem_addr, mask);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_maskloadps(cast(float4*)mem_addr, mask);
    }
    else
    {
        // A negative 32-bit mask element has its high bit set: only then is memory read.
        int4 imask = cast(int4)mask;
        float4 r;
        r.ptr[0] = (imask.array[0] < 0) ? mem_addr[0] : 0.0f;
        r.ptr[1] = (imask.array[1] < 0) ? mem_addr[1] : 0.0f;
        r.ptr[2] = (imask.array[2] < 0) ? mem_addr[2] : 0.0f;
        r.ptr[3] = (imask.array[3] < 0) ? mem_addr[3] : 0.0f;
        return r;
    }
}
unittest
{
    static if (!maskLoadWorkaround)
    {
        float[4] A = [7.5f, 1, 2, 3];
        // Masked loads and stores can NOT be relied upon to skip invalid memory: the whole
        // range must be addressable, even for masked-out elements.
        float4 B = _mm_maskload_ps(A.ptr, _mm_setr_epi32(1, -1, -1, 1));
        float[4] correct = [0.0f, 1, 2, 0];
        assert(B.array == correct);
    }
}

/// Load packed single-precision (32-bit) floating-point elements from memory using `mask`
/// (elements are zeroed out when the high bit of the corresponding element is not set).
/// Note: emulating that instruction isn't efficient, since it needs to perform memory access
/// only when needed.
/// See: "Note about mask load/store" to know why you must address valid memory only.
__m256 _mm256_maskload_ps (const(float)* mem_addr, __m256i mask) /*pure*/ @system
{
    // PERF DMD
    // PERF ARM64
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadps256(mem_addr, cast(int8)mask);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_maskloadps256(cast(float8*)mem_addr, cast(int8)mask);
    }
    else
    {
        // A negative 32-bit mask element has its high bit set: only then is memory read.
        int8 imask = cast(int8)mask;
        float8 r;
        foreach(n; 0..8)
            r.ptr[n] = (imask.array[n] < 0) ? mem_addr[n] : 0.0f;
        return r;
    }
}
unittest
{
    float[8] A                 = [1,   7.5f,  1,  2, 3,  4,  5, 6];
    __m256i  M = _mm256_setr_epi32(1,     -1,  1, -1, 1, -1, -1, 1);
    float8 B = _mm256_maskload_ps(A.ptr, M);
    float[8] correct =            [0.0f, 7.5f, 0,  2, 0,  4,  5, 0];
    assert(B.array == correct);
}

/// Store packed double-precision (64-bit) floating-point elements from `a` into memory using
/// `mask` (an element is written only when the high bit of the corresponding mask element is set).
/// Note: emulating that instruction isn't efficient, since it needs to perform memory access
/// only when needed.
/// See: "Note about mask load/store" to know why you must address valid memory only.
void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a) /* pure */ @system
{
    // PERF DMD
    // PERF ARM64
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        __builtin_ia32_maskstorepd(mem_addr, cast(long2)mask, a);
    }
    else static if (GDC_with_AVX)
    {
        __builtin_ia32_maskstorepd(cast(double2*)mem_addr, cast(long2)mask, a);
    }
    else
    {
        // A negative 64-bit mask element has its high bit set: only then is memory written.
        long2 imask = cast(long2)mask;
        foreach(n; 0..2)
            if (imask.array[n] < 0)
                mem_addr[n] = a.array[n];
    }
}
unittest
{
    double[2] A = [0.0, 1.0];
    __m128i M = _mm_setr_epi64(-1, 0);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    _mm_maskstore_pd(A.ptr, M, B);
    double[2] correct = [2.0, 1.0];
    assert(A == correct);
}


/// Store packed double-precision (64-bit) floating-point elements from `a` into memory using `mask`.
/// See: "Note about mask load/store" to know why you must address valid memory only.
static if (!llvm256BitStackWorkaroundIn32BitX86)
{
    void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a) /* pure */ @system
    {
        // PERF DMD
        // PERF ARM64
        static if (LDC_with_AVX)
        {
            // MAYDO report that the builtin is impure
            __builtin_ia32_maskstorepd256(mem_addr, cast(long4)mask, a);
        }
        else static if (GDC_with_AVX)
        {
            __builtin_ia32_maskstorepd256(cast(double4*)mem_addr, cast(long4)mask, a);
        }
        else
        {
            // A negative 64-bit mask element has its high bit set: only then is memory written.
            long4 selector = cast(long4)mask;
            foreach (lane; 0 .. 4)
            {
                if (selector.array[lane] < 0)
                    mem_addr[lane] = a.array[lane];
            }
        }
    }
    unittest
    {
        double[4] dest = [0.0, 1, 2, 3];
        double[4] expected = [2.0, 1, 4, 3];
        __m256i selector = _mm256_setr_epi64x(-9, 0, -1, 0);
        __m256d values = _mm256_setr_pd(2, 3, 4, 5);
        _mm256_maskstore_pd(dest.ptr, selector, values);
        assert(dest == expected);
    }
}

/// Store packed single-precision (32-bit) floating-point elements from `a` into memory using `mask`.
/// Note: emulating that instruction isn't efficient, since it needs to perform memory access
/// only when needed.
/// See: "Note about mask load/store" to know why you must address valid memory only.
void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a) /* pure */ @system
{
    // PERF DMD
    // PERF ARM64
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        __builtin_ia32_maskstoreps(mem_addr, mask, a);
    }
    else static if (GDC_with_AVX)
    {
        // The GDC builtin is called with a vector pointer, hence the pointer cast.
        __builtin_ia32_maskstoreps(cast(float4*)mem_addr, mask, a);
    }
    else
    {
        // Scalar emulation: element `n` is stored only when the 32-bit mask element is negative.
        int4 imask = cast(int4)mask;
        foreach(n; 0..4)
            if (imask.array[n] < 0)
                mem_addr[n] = a.array[n];
    }
}
unittest
{
    float[4] A = [0.0f, 1, 2, 6];
    // Elements 0 and 2 are selected by the mask; 1 and 3 must stay untouched.
    __m128i M = _mm_setr_epi32(-1, 0, -1, 0);
    __m128 B = _mm_setr_ps(2, 3, 4, 5);
    _mm_maskstore_ps(A.ptr, M, B);
    float[4] correct = [2.0f, 1, 4, 6];
    assert(A == correct);
}

static if (!llvm256BitStackWorkaroundIn32BitX86)
{
    /// Store packed single-precision (32-bit) floating-point elements from `a` into memory using `mask`.
    /// In the portable fallback, element `n` is written to `mem_addr[n]` only when the
    /// corresponding 32-bit element of `mask` is negative.
    /// See: "Note about mask load/store" to know why you must address valid memory only.
    void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) /* pure */ @system
    {
        // PERF DMD
        // PERF ARM64
        static if (LDC_with_AVX)
        {
            // MAYDO report that the builtin is impure
            __builtin_ia32_maskstoreps256(mem_addr, cast(int8)mask, a);
        }
        else static if (GDC_with_AVX)
        {
            __builtin_ia32_maskstoreps256(cast(float8*)mem_addr, cast(int8)mask, a);
        }
        else
        {
            // Scalar emulation over the eight 32-bit mask elements.
            int8 imask = cast(int8)mask;
            foreach(n; 0..8)
                if (imask.array[n] < 0)
                    mem_addr[n] = a.array[n];
        }
    }
    unittest
    {
        float[8] A = [0.0f, 0, 1, 2, 3, 4, 5, 7];
        __m256i M = _mm256_setr_epi32( 0, -1, 0, -1, 0, -1, -1, 0);
        __m256 B = _mm256_set1_ps(6.0f);
        _mm256_maskstore_ps(A.ptr, M, B);
        float[8] correct = [0.0f, 6, 1, 6, 3, 6, 6, 7];
        assert(A == correct);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return
/// packed maximum values.
/// In the portable fallback each result element is `a[i]` when `a[i] > b[i]`, else `b[i]`.
__m256d _mm256_max_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_maxpd256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2
        // PERF: GDC without AVX
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
    __m256d B = _mm256_setr_pd(1.0, 8.0, 0.0, 100000.0);
    __m256d M = _mm256_max_pd(A, B);
    double[4] correct = [4.0, 8.0, 0.0, double.infinity];
    // The result was previously computed but never checked.
    assert(M.array == correct);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return
/// packed maximum values.
__m256 _mm256_max_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_maxps256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2, but looks brittle.
        // PERF GDC without AVX
        // Each result element is a[i] when a[i] > b[i], else b[i].
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
        a.ptr[4] = (a.array[4] > b.array[4]) ? a.array[4] : b.array[4];
        a.ptr[5] = (a.array[5] > b.array[5]) ? a.array[5] : b.array[5];
        a.ptr[6] = (a.array[6] > b.array[6]) ? a.array[6] : b.array[6];
        a.ptr[7] = (a.array[7] > b.array[7]) ? a.array[7] : b.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
    __m256 B = _mm256_setr_ps(1.0, 8.0, 0.0, 100000.0f , 4, 3, 2, 1);
    __m256 M = _mm256_max_ps(A, B);
    float[8] correct = [4.0, 8.0, 0.0, float.infinity , 4, 3, 3, 4];
    // The result was previously computed but never checked.
    assert(M.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return
/// packed minimum values.
/// In the portable fallback each result element is `a[i]` when `a[i] < b[i]`, else `b[i]`.
__m256d _mm256_min_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_minpd256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2
        // PERF: GDC without AVX
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
    __m256d B = _mm256_setr_pd(1.0, 8.0, 0.0, 100000.0);
    __m256d M = _mm256_min_pd(A, B);
    // min(1.0, 8.0) is 1.0: the former expected value of 8.0 at index 1 was wrong and
    // went unnoticed because the result was never asserted.
    double[4] correct = [1.0, 1.0, -9.0, 100000.0];
    assert(M.array == correct);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return
/// packed minimum values.
/// In the portable fallback each result element is `a[i]` when `a[i] < b[i]`, else `b[i]`.
__m256 _mm256_min_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_minps256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2, but looks brittle.
        // PERF GDC without AVX
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
        a.ptr[4] = (a.array[4] < b.array[4]) ? a.array[4] : b.array[4];
        a.ptr[5] = (a.array[5] < b.array[5]) ? a.array[5] : b.array[5];
        a.ptr[6] = (a.array[6] < b.array[6]) ? a.array[6] : b.array[6];
        a.ptr[7] = (a.array[7] < b.array[7]) ? a.array[7] : b.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
    __m256 B = _mm256_setr_ps(1.0, 8.0, 0.0, 100000.0f , 4, 3, 2, 1);
    __m256 M = _mm256_min_ps(A, B);
    float[8] correct = [1.0, 1.0, -9.0, 100000.0f , 1, 2, 2, 1];
    // The result was previously computed but never checked.
    assert(M.array == correct);
}

/// Duplicate even-indexed double-precision (64-bit) floating-point elements from `a`.
__m256d _mm256_movedup_pd (__m256d a) @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_movddup256 (a);
    }
    else
    {
        // Overwrite every odd lane with the even lane just before it.
        for (int even = 0; even < 4; even += 2)
            a.ptr[even + 1] = a.array[even];
        return a;
    }
}
unittest
{
    __m256d v = _mm256_setr_pd(1.0, 2, 3, 4);
    v = _mm256_movedup_pd(v);
    double[4] expected = [1.0, 1, 3, 3];
    assert(v.array == expected);
}

/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
__m256 _mm256_movehdup_ps (__m256 a) @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_movshdup256 (a);
    }
    else
    {
        // Overwrite every even lane with the odd lane just after it.
        for (int odd = 1; odd < 8; odd += 2)
            a.ptr[odd - 1] = a.array[odd];
        return a;
    }
}
unittest
{
    __m256 v = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    v = _mm256_movehdup_ps(v);
    float[8] expected = [2.0, 2, 4, 4, 6, 6, 8, 8];
    assert(v.array == expected);
}

/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
__m256 _mm256_moveldup_ps (__m256 a) @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_movsldup256 (a);
    }
    else
    {
        // Overwrite every odd lane with the even lane just before it.
        for (int even = 0; even < 8; even += 2)
            a.ptr[even + 1] = a.array[even];
        return a;
    }
}
unittest
{
    __m256 v = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    v = _mm256_moveldup_ps(v);
    float[8] expected = [1.0, 1, 3, 3, 5, 5, 7, 7];
    assert(v.array == expected);
}

/// Set each bit of result mask based on the most significant bit of the corresponding packed
/// double-precision (64-bit) floating-point element in `a`.
int _mm256_movemask_pd (__m256d a) @safe
{
    // PERF: DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_movmskpd256(a);
    }
    else static if (LDC_with_SSE2)
    {
        // this doesn't benefit GDC, and not clear for arm64.
        // Compute the 2-bit mask of each 128-bit half, then place the high half at bits 2..3.
        int lowBits  = _mm_movemask_pd(_mm256_extractf128_pd!0(a));
        int highBits = _mm_movemask_pd(_mm256_extractf128_pd!1(a));
        return (highBits << 2) | lowBits;
    }
    else
    {
        // Reinterpret as 64-bit integers: a negative integer means the sign bit is set.
        long4 bits = cast(long4)a;
        return ((bits.array[0] < 0) ? 1 : 0)
             | ((bits.array[1] < 0) ? 2 : 0)
             | ((bits.array[2] < 0) ? 4 : 0)
             | ((bits.array[3] < 0) ? 8 : 0);
    }
}
unittest
{
    // Lanes 0, 1 and 3 are negative; lane 2 is +0.
    __m256d v = _mm256_setr_pd(-1, -double.infinity, 0, -1);
    assert(_mm256_movemask_pd(v) == 1 + 2 + 8);
}

/// Set each bit of mask result based on the most significant bit of the corresponding packed
/// single-precision (32-bit) floating-point element in `a`.
int _mm256_movemask_ps (__m256 a) @system
{
    // PERF: DMD
    // PERF GDC without AVX
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_movmskps256(a);
    }
    else version(LDC)
    {
        // this doesn't benefit GDC (unable to inline), but benefits both LDC with SSE2 and ARM64
        // Compute the 4-bit mask of each 128-bit half, then place the high half at bits 4..7.
        int lowBits  = _mm_movemask_ps(_mm256_extractf128_ps!0(a));
        int highBits = _mm_movemask_ps(_mm256_extractf128_ps!1(a));
        return (highBits << 4) | lowBits;
    }
    else
    {
        // Reinterpret as 32-bit integers: a negative integer means the sign bit is set.
        int8 bits = cast(int8)a;
        return ((bits.array[0] < 0) ?   1 : 0)
             | ((bits.array[1] < 0) ?   2 : 0)
             | ((bits.array[2] < 0) ?   4 : 0)
             | ((bits.array[3] < 0) ?   8 : 0)
             | ((bits.array[4] < 0) ?  16 : 0)
             | ((bits.array[5] < 0) ?  32 : 0)
             | ((bits.array[6] < 0) ?  64 : 0)
             | ((bits.array[7] < 0) ? 128 : 0);
    }
}
unittest
{
    __m256 v = _mm256_setr_ps(-1, -double.infinity, 0, -1, 1, double.infinity, -2, double.nan);
    assert(_mm256_movemask_ps(v) == 1 + 2 + 8 + 64);
}

/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_mul_pd (__m256d a, __m256d b) pure @safe
{
    // Element-wise product through the vector `*` operator.
    __m256d product = a * b;
    return product;
}
unittest
{
    __m256d v = [-2.0, 1.5, -2.0, 1.5];
    v = _mm256_mul_pd(v, v);
    assert(v.array == [4.0, 2.25, 4.0, 2.25]);
}

/// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_mul_ps (__m256 a, __m256 b) pure @safe
{
    // Element-wise product through the vector `*` operator.
    __m256 product = a * b;
    return product;
}
unittest
{
    __m256 v = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2.0f, 3.0f, 1.0f];
    v = _mm256_mul_ps(v, v);
    float[8] expected = [2.25f, 4.0f, 9.0f, 1.0f, 2.25f, 4.0f, 9.0f, 1.0f];
    assert(v.array == expected);
}


/// Compute the bitwise NOT of 256 bits in `a`.
/// #BONUS
__m256i _mm256_not_si256 (__m256i a) pure @safe
{
    __m256i flipped = ~a;
    return flipped;
}
unittest
{
    // ~(-748) is 747 in every 64-bit lane.
    __m256i input = _mm256_set1_epi64x(-748);
    long4 inverted = cast(long4) _mm256_not_si256(input);
    int[4] expected = [747, 747, 747, 747];
    assert(inverted.array == expected);
}

/// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_or_pd (__m256d a, __m256d b) pure @safe
{
    // OR is performed on the integer reinterpretation of both operands.
    __m256i bitsA = cast(__m256i)a;
    __m256i bitsB = cast(__m256i)b;
    return cast(__m256d)(bitsA | bitsB);
}

/// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_or_ps (__m256 a, __m256 b) pure @safe
{
    // OR is performed on the integer reinterpretation of both operands.
    __m256i bitsA = cast(__m256i)a;
    __m256i bitsB = cast(__m256i)b;
    return cast(__m256)(bitsA | bitsB);
}

/// Shuffle double-precision (64-bit) floating-point elements in `a` using the control in `imm8`.
__m128d _mm_permute_pd(int imm8)(__m128d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilpd(a, imm8 & 3);
    }
    else
    {
        // Shufflevector not particularly better for LDC here
        // Bit 0 of imm8 picks the source of element 0, bit 1 the source of element 1.
        enum int pick0 = imm8 & 1;
        enum int pick1 = (imm8 >> 1) & 1;
        __m128d shuffled;
        shuffled.ptr[0] = a.array[pick0];
        shuffled.ptr[1] = a.array[pick1];
        return shuffled;
    }
}
unittest
{
    __m128d src = _mm_setr_pd(5, 6);
    __m128d swapped = _mm_permute_pd!1(src);
    __m128d bothHigh = _mm_permute_pd!3(src);
    double[2] expectedSwapped = [6, 5];
    double[2] expectedBothHigh = [6, 6];
    assert(swapped.array == expectedSwapped);
    assert(bothHigh.array == expectedBothHigh);
}

///ditto
__m256d _mm256_permute_pd(int imm8)(__m256d a) pure @trusted
{
    // PERF DMD
    // One control bit per result element; elements 2 and 3 only pick inside the upper half.
    enum int pick0 =      (imm8 >> 0) & 1;
    enum int pick1 =      (imm8 >> 1) & 1;
    enum int pick2 = 2 + ((imm8 >> 2) & 1);
    enum int pick3 = 2 + ((imm8 >> 3) & 1);

    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilpd256(a, imm8 & 15);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(double4, pick0, pick1, pick2, pick3)(a, a);
    }
    else
    {
        __m256d shuffled;
        shuffled.ptr[0] = a.array[pick0];
        shuffled.ptr[1] = a.array[pick1];
        shuffled.ptr[2] = a.array[pick2];
        shuffled.ptr[3] = a.array[pick3];
        return shuffled;
    }
}
unittest
{
    // Control bits 0 and 2 set: swap inside each 128-bit half.
    __m256d src = _mm256_setr_pd(0.0, 1, 2, 3);
    __m256d got = _mm256_permute_pd!(1 + 4)(src);
    double[4] expected = [1.0, 0, 3, 2];
    assert(got.array == expected);
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` using the control in `imm8`.
__m128 _mm_permute_ps(int imm8)(__m128 a) pure @trusted
{
    // PERF DMD
    // Two control bits per result element.
    enum int pick0 = (imm8 >> 0) & 3;
    enum int pick1 = (imm8 >> 2) & 3;
    enum int pick2 = (imm8 >> 4) & 3;
    enum int pick3 = (imm8 >> 6) & 3;

    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilps(a, cast(ubyte)imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(float4, pick0, pick1, pick2, pick3)(a, a);
    }
    else
    {
        // PERF: could use _mm_shuffle_ps which is a super set
        // when AVX isn't available
        __m128 shuffled;
        shuffled.ptr[0] = a.array[pick0];
        shuffled.ptr[1] = a.array[pick1];
        shuffled.ptr[2] = a.array[pick2];
        shuffled.ptr[3] = a.array[pick3];
        return shuffled;
    }
}
unittest
{
    __m128 src = _mm_setr_ps(0.0f, 1, 2, 3);
    __m128 got = _mm_permute_ps!(1 + 4 * 3 + 16 * 0 + 64 * 2)(src);
    float[4] expected = [1.0f, 3, 0, 2];
    assert(got.array == expected);
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using
/// the control in `imm8`. The same shuffle is applied in lower and higher 128-bit lane.
__m256 _mm256_permute_ps(int imm8)(__m256 a,) pure @trusted
{
    // PERF DMD
    // Two control bits per element; the same four selectors are reused for both 128-bit lanes.
    enum int pick0 = (imm8 >> 0) & 3;
    enum int pick1 = (imm8 >> 2) & 3;
    enum int pick2 = (imm8 >> 4) & 3;
    enum int pick3 = (imm8 >> 6) & 3;

    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilps256(a, cast(ubyte)imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(float8,
                                 pick0, pick1, pick2, pick3,
                                 4 + pick0, 4 + pick1, 4 + pick2, 4 + pick3)(a, a);
    }
    else
    {
        __m256 shuffled;
        // `base` is the first element index of the 128-bit lane being filled (0, then 4).
        for (int base = 0; base < 8; base += 4)
        {
            shuffled.ptr[base + 0] = a.array[base + pick0];
            shuffled.ptr[base + 1] = a.array[base + pick1];
            shuffled.ptr[base + 2] = a.array[base + pick2];
            shuffled.ptr[base + 3] = a.array[base + pick3];
        }
        return shuffled;
    }
}
unittest
{
    __m256 src = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    __m256 got = _mm256_permute_ps!(1 + 4 * 3 + 16 * 0 + 64 * 2)(src);
    float[8] expected = [1.0f, 3, 0, 2, 5, 7, 4, 6];
    assert(got.array == expected);
}

/// Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// selected by `imm8` from `a` and `b`.
__m256d _mm256_permute2f128_pd(int imm8)(__m256d a, __m256d b) pure @safe
{
    // Pure bit shuffle: delegate to the integer version and reinterpret the result.
    return cast(__m256d) _mm256_permute2f128_si256!imm8(cast(__m256i)a, cast(__m256i)b);
}
///ditto
__m256 _mm256_permute2f128_ps(int imm8)(__m256 a, __m256 b) pure @safe
{
    // The return type must be `__m256` to match the single-precision operands and the cast
    // below; it was declared `__m256d`, which made every instantiation fail to compile.
    return cast(__m256) _mm256_permute2f128_si256!imm8(cast(__m256i)a, cast(__m256i)b);
}
///ditto
__m256i _mm256_permute2f128_si256(int imm8)(__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_vperm2f128_si256(cast(int8)a, cast(int8)b, cast(ubyte)imm8);
    }
    else
    {
        // Build one 128-bit half of the result from a 4-bit selector:
        //   bit 3 set -> the half is zeroed
        //   bit 1     -> source operand (0: `a`, 1: `b`)
        //   bit 0     -> which half of that operand (0: low, 1: high)
        static __m128i SELECT4(int imm4)(__m256i a, __m256i b) pure @trusted
        {
            static assert(imm4 >= 0 && imm4 <= 15);
            static if (imm4 & 8)
            {
                return _mm_setzero_si128();
            }
            else static if ((imm4 & 2) == 0)
            {
                long2 r;
                enum int index = 2*(imm4 & 1);
                r.ptr[0] = a.array[index+0];
                r.ptr[1] = a.array[index+1];
                return cast(__m128i)r;
            }
            else
            {
                static assert( (imm4 & 2) != 0);
                long2 r;
                enum int index = 2*(imm4 & 1);
                r.ptr[0] = b.array[index+0];
                r.ptr[1] = b.array[index+1];
                return cast(__m128i)r;
            }
        }

        // The low nibble of imm8 selects the low half, the high nibble the high half.
        __m128i lo = SELECT4!(imm8 & 15)(a, b);
        __m128i hi = SELECT4!((imm8 >> 4) & 15)(a, b);
        return _mm256_set_m128i(hi, lo);
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(8.0, 1, 2, 3);
    __m256d B = _mm256_setr_pd(4.0, 5, 6, 7);
    __m256d R = _mm256_permute2f128_pd!(128 + 2)(A, B);
    double[4] correct = [4.0, 5.0, 0.0, 0.0];
    assert(R.array == correct);

    __m256d R2 = _mm256_permute2f128_pd!(3*16 + 1)(A, B);
    double[4] correct2 = [2.0, 3.0, 6.0, 7.0];
    assert(R2.array == correct2);
}
unittest
{
    // Instantiate the single-precision variant too, so a signature mismatch is caught.
    __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    __m256 B = _mm256_setr_ps(8.0f, 9, 10, 11, 12, 13, 14, 15);
    // Low half <- high half of A (selector 1), high half <- high half of B (selector 3).
    __m256 R = _mm256_permute2f128_ps!(3*16 + 1)(A, B);
    float[8] correct = [4.0f, 5, 6, 7, 12, 13, 14, 15];
    assert(R.array == correct);
}

/// Shuffle double-precision (64-bit) floating-point elements in `a` using the control in `b`.
/// Warning: the selector is in bit 1, not bit 0, of each 64-bit element!
///          This is really not intuitive.
__m128d _mm_permutevar_pd(__m128d a, __m128i b) pure @trusted
{
    // PERF ARM64 doesn't seem that great in arm64
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m128d) __builtin_ia32_vpermilvarpd(a, cast(long2)b);
    }
    else
    {
        long2 control = cast(long2)b;
        __m128d shuffled;
        // Bit 1 of each 64-bit control element picks source element 0 or 1.
        foreach (i; 0..2)
            shuffled.ptr[i] = a.array[ (control.array[i] & 2) >> 1];
        return shuffled;
    }
}
unittest
{
    __m128d src = _mm_setr_pd(5, 6);
    __m128d swapped = _mm_permutevar_pd(src, _mm_setr_epi64(2, 1));
    __m128d bothHigh = _mm_permutevar_pd(src, _mm_setr_epi64(1 + 2 + 4, 2));
    // yup, this is super strange, it's actually taking bit 1 and not bit 0 of each 64-bit element
    double[2] expectedSwapped = [6, 5];
    double[2] expectedBothHigh = [6, 6];
    assert(swapped.array == expectedSwapped);
    assert(bothHigh.array == expectedBothHigh);
}

///ditto
__m256d _mm256_permutevar_pd (__m256d a, __m256i b) pure @trusted
{
    // PERF ARM64
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256d) __builtin_ia32_vpermilvarpd256(a, cast(long4)b);
    }
    else
    {
        long4 control = cast(long4)b;
        __m256d shuffled;
        // Each result element picks within its own 128-bit half, using bit 1 of its control.
        foreach (i; 0..2)
        {
            shuffled.ptr[i]     = a.array[ (control.array[i] & 2) >> 1];
            shuffled.ptr[i + 2] = a.array[2 + ((control.array[i + 2] & 2) >> 1)];
        }
        return shuffled;
    }
}
unittest
{
    __m256d src = _mm256_setr_pd(5, 6, 7, 8);
    __m256d gotB = _mm256_permutevar_pd(src, _mm256_setr_epi64(2, 1, 0, 2));
    __m256d gotC = _mm256_permutevar_pd(src, _mm256_setr_epi64(1 + 2 + 4, 2, 2, 0));
    // yup, this is super strange, it's actually taking bit 1 and not bit 0 of each 64-bit element
    double[4] expectedB = [6, 5, 7, 8];
    double[4] expectedC = [6, 6, 8, 7];
    assert(gotB.array == expectedB);
    assert(gotC.array == expectedC);
}

/// Shuffle
/// single-precision (32-bit) floating-point elements in `a` using the control in `b`.
__m128 _mm_permutevar_ps (__m128 a, __m128i b) pure @trusted
{
    // PERF ARM64
    // PERF LDC without AVX
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m128) __builtin_ia32_vpermilvarps(a, cast(int4)b);
    }
    else
    {
        int4 control = cast(int4)b;
        __m128 shuffled;
        // Only the two low bits of each 32-bit control element are used as a source index.
        foreach (i; 0..4)
            shuffled.ptr[i] = a.array[ (control.array[i] & 3) ];
        return shuffled;
    }
}
unittest
{
    __m128 src = _mm_setr_ps(5, 6, 7, 8);
    __m128 gotB = _mm_permutevar_ps(src, _mm_setr_epi32(2, 1, 0, 2 + 4));
    __m128 gotC = _mm_permutevar_ps(src, _mm_setr_epi32(2, 3 + 8, 1, 0));
    float[4] expectedB = [7, 6, 5, 7];
    float[4] expectedC = [7, 8, 6, 5];
    assert(gotB.array == expectedB);
    assert(gotC.array == expectedC);
}

///ditto
__m256 _mm256_permutevar_ps (__m256 a, __m256i b) pure @trusted
{
    // PERF ARM64 catastrophic
    // PERF LDC without AVX, real bad
    // PERF GDC
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vpermilvarps256(a, cast(int8)b);
    }
    else
    {
        int8 control = cast(int8)b;
        __m256 shuffled;
        // Elements 0..3 pick inside the low 128-bit lane, elements 4..7 inside the high lane.
        foreach (i; 0..4)
        {
            shuffled.ptr[i]     = a.array[ (control.array[i] & 3) ];
            shuffled.ptr[i + 4] = a.array[ 4 + (control.array[i + 4] & 3) ];
        }
        return shuffled;
    }
}
unittest
{
    __m256 src = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);
    __m256 gotB = _mm256_permutevar_ps(src, _mm256_setr_epi32(2, 1, 0, 2, 3, 2, 1, 0));
    __m256 gotC = _mm256_permutevar_ps(src, _mm256_setr_epi32(2, 3 + 8, 1, 0, 2, 3, 0, 1));
    float[8] expectedB = [3.0f, 2, 1, 3, 8, 7, 6, 5];
    float[8] expectedC = [3.0f, 4, 2, 1, 7, 8, 5, 6];
    assert(gotB.array == expectedB);
    assert(gotC.array == expectedC);
}

/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements
/// in `a`. The maximum relative error for this approximation is less than 1.5*2^-12.
__m256 _mm256_rcp_ps (__m256 a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_rcpps256(a);
    }
    else
    {
        // Without the builtin, a plain division is performed for each element.
        foreach (i; 0..8)
            a.ptr[i] = 1.0f / a.array[i];
        return a;
    }
}
unittest
{
    __m256 input = _mm256_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f, 9, -46, 1869816, 55583);
    __m256 reference = _mm256_set1_ps(1.0f) / input;
    __m256 approx = _mm256_rcp_ps(input);
    foreach(i; 0..8)
    {
        double relError = (cast(double)(reference.array[i]) / approx.array[i]) - 1;
        assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
    }
}

/// Round the packed double-precision (64-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed double-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
/// Note: without AVX, the emulation goes through 64-bit integers, so it does not work for
/// large values (see the comment in the body).
__m256d _mm256_round_pd(int rounding)(__m256d a) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_roundpd256(a, rounding);
    }
    else static if (LDC_with_AVX)
    {
        return __builtin_ia32_roundpd256(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Current-direction case: the rounding mode is left as it is.
            // PERF: non-AVX x86, would probably be faster to convert those double at once to int64

            __m128d A_lo = _mm256_extractf128_pd!0(a);
            __m128d A_hi = _mm256_extractf128_pd!1(a);

            // Convert to 64-bit integers one by one
            // (element 1 of each half is moved into element 0 before the second conversion;
            //  presumably `_mm_cvtsd_si64` only reads element 0 -- TODO confirm)
            long x0 = _mm_cvtsd_si64(A_lo);
            long x2 = _mm_cvtsd_si64(A_hi);
            A_lo.ptr[0] = A_lo.array[1];
            A_hi.ptr[0] = A_hi.array[1];
            long x1 = _mm_cvtsd_si64(A_lo);
            long x3 = _mm_cvtsd_si64(A_hi);

            // The integers are converted back to double when building the result.
            return _mm256_setr_pd(x0, x1, x2, x3);
        }
        else
        {
            version(GNU) pragma(inline, false); // this was required for SSE4.1 rounding, let it here

            // Save the current rounding mode, then install the one requested by the two low
            // bits of `rounding`.
            // NOTE(review): the shift by 13 presumably places them on the MXCSR.RC field
            // expected by _MM_SET_ROUNDING_MODE -- confirm.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            __m128d A_lo = _mm256_extractf128_pd!0(a);
            __m128d A_hi = _mm256_extractf128_pd!1(a);

            // Convert to 64-bit integers one by one
            long x0 = _mm_cvtsd_si64(A_lo);
            long x2 = _mm_cvtsd_si64(A_hi);
            A_lo.ptr[0] = A_lo.array[1];
            A_hi.ptr[0] = A_hi.array[1];
            long x1 = _mm_cvtsd_si64(A_lo);
            long x3 = _mm_cvtsd_si64(A_hi);

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (FUTURE: what range exactly?)
            // The saved rounding mode is restored before returning.
            // NOTE(review): _mm256_round_ps uses scope(exit) for the same restore.
            _MM_SET_ROUNDING_MODE(old);
            return _mm256_setr_pd(x0, x1, x2, x3);
        }
    }
}
unittest
{
    // tested in other intrinsics
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
/// Note: without AVX or SSE4.1, the emulation goes through 32-bit integers, so it does not
/// work for large values (see the comment in the body).
__m256 _mm256_round_ps(int rounding)(__m256 a) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_roundps256(a, rounding);
    }
    else static if (GDC_or_LDC_with_SSE41)
    {
        // we can use _mm_round_ps
        // Round each 128-bit half separately, then reassemble.
        __m128 lo = _mm256_extractf128_ps!0(a);
        __m128 hi = _mm256_extractf128_ps!1(a);
        __m128 ilo = _mm_round_ps!rounding(lo); // unfortunately _mm_round_ps isn't fast for arm64, so we avoid that in that case
        __m128 ihi = _mm_round_ps!rounding(hi);
        return _mm256_set_m128(ihi, ilo);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Current-direction case: convert to integers and back without touching the mode.
            __m256i integers = _mm256_cvtps_epi32(a);
            return _mm256_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 gets shuffled
            // Install the requested rounding mode; scope(exit) restores the saved one on
            // every exit path of this block.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m256i integers = _mm256_cvtps_epi32(a);

            // Convert back to float to achieve the rounding
            // The problem is that a 32-float can't represent all the values
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (FUTURE: what range exactly?)
            __m256 result = _mm256_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}


/// Compute the approximate reciprocal square root of packed single-precision (32-bit)
/// floating-point elements in `a`. The maximum relative error for this approximation is less than
/// 1.5*2^-12.
/// Without the AVX builtin, each element is computed as `1.0f / sqrt(x)`.
__m256 _mm256_rsqrt_ps (__m256 a) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_rsqrtps256(a);
    }
    else version(LDC)
    {
        // LDC without AVX: element-wise 1/sqrt through the llvm_sqrt intrinsic.
        a[0] = 1.0f / llvm_sqrt(a[0]);
        a[1] = 1.0f / llvm_sqrt(a[1]);
        a[2] = 1.0f / llvm_sqrt(a[2]);
        a[3] = 1.0f / llvm_sqrt(a[3]);
        a[4] = 1.0f / llvm_sqrt(a[4]);
        a[5] = 1.0f / llvm_sqrt(a[5]);
        a[6] = 1.0f / llvm_sqrt(a[6]);
        a[7] = 1.0f / llvm_sqrt(a[7]);
        return a;
    }
    else
    {
        // Other compilers: element-wise 1/sqrt through `sqrt`.
        a.ptr[0] = 1.0f / sqrt(a.array[0]);
        a.ptr[1] = 1.0f / sqrt(a.array[1]);
        a.ptr[2] = 1.0f / sqrt(a.array[2]);
        a.ptr[3] = 1.0f / sqrt(a.array[3]);
        a.ptr[4] = 1.0f / sqrt(a.array[4]);
        a.ptr[5] = 1.0f / sqrt(a.array[5]);
        a.ptr[6] = 1.0f / sqrt(a.array[6]);
        a.ptr[7] = 1.0f / sqrt(a.array[7]);
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f, 2.34f, 70000.0f, 0.00001f, 345.5f);
    __m256 groundTruth = _mm256_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f,
                                        0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f);
    __m256 result = _mm256_rsqrt_ps(A);
    // Compare against precomputed values, within the documented relative error.
    foreach(i; 0..8)
    {
        double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
        assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
    }
}

/// Set packed 16-bit integers with the supplied values.
/// Arguments are listed from the highest element (`e15`) down to the lowest (`e0`);
/// `e0` is stored at index 0 of the result.
__m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short16 r; // Note: = void would prevent GDC from inlining a constant short16...
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    r.ptr[4] = e4;
    r.ptr[5] = e5;
    r.ptr[6] = e6;
    r.ptr[7] = e7;
    r.ptr[8] = e8;
    r.ptr[9] = e9;
    r.ptr[10] = e10;
    r.ptr[11] = e11;
    r.ptr[12] = e12;
    r.ptr[13] = e13;
    r.ptr[14] = e14;
    r.ptr[15] = e15;
    return cast(__m256i) r;
}
unittest
{
    // Passing 15 down to 0 must yield element i == i.
    short16 A = cast(short16) _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8,
                                               7, 6, 5, 4, 3, 2, 1, 0);
    foreach(i; 0..16)
        assert(A.array[i] == i);
}

/// Set packed 32-bit integers with the supplied values.
/// Arguments are listed from the highest element (`e7`) down to the lowest (`e0`);
/// `e0` is stored at index 0 of the result.
__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
{
    // Inlines a constant with GCC -O1, LDC -O2
    int8 r; // = void would prevent GCC from inlining a constant call
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    r.ptr[4] = e4;
    r.ptr[5] = e5;
    r.ptr[6] = e6;
    r.ptr[7] = e7;
    return cast(__m256i)r;
}
unittest
{
    // Passing 7 down to 0 must yield element i == i.
    int8 A = cast(int8) _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    foreach(i; 0..8)
        assert(A.array[i] == i);
}

/// Set packed 64-bit integers with the supplied values.
__m256i _mm256_set_epi64x (long e3, long e2, long e1, long e0) pure @trusted
{
    // Arguments arrive highest element first; `e0` lands at index 0.
    long4 packed = void;
    packed.ptr[3] = e3;
    packed.ptr[2] = e2;
    packed.ptr[1] = e1;
    packed.ptr[0] = e0;
    return packed;
}
unittest
{
    __m256i got = _mm256_set_epi64x(-1, 42, long.min, long.max);
    long[4] expected = [long.max, long.min, 42, -1];
    assert(got.array == expected);
}

///ditto
alias _mm256_set_epi64 = _mm256_set_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API.

/// Set packed 8-bit integers with the supplied values.
__m256i _mm256_set_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
                         byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
                         byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9,  byte e8,
                         byte e7,  byte e6,  byte e5,  byte e4,  byte e3,  byte e2,  byte e1,  byte e0)
{
    // Inline a constant call in GDC -O1 and LDC -O2
    // Lay the bytes out lowest element first in a 32-byte aligned buffer, then read the
    // buffer back as one 256-bit vector.
    align(32) byte[32] lanes = [ e0,  e1,  e2,  e3,
                                 e4,  e5,  e6,  e7,
                                 e8,  e9,  e10, e11,
                                 e12, e13, e14, e15,
                                 e16, e17, e18, e19,
                                 e20, e21, e22, e23,
                                 e24, e25, e26, e27,
                                 e28, e29, e30, e31 ];
    __m256i* asVector = cast(__m256i*)(lanes.ptr);
    return *asVector;
}
unittest
{
    byte32 got = cast(byte32) _mm256_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7);
    // The expected array is the argument list read backwards.
    byte[32] expected = [7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0, 3, 2, 1, 0,
                         14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
    assert(got.array == expected);
}

/// Set packed `__m256` vector with the supplied values.
__m256 _mm256_set_m128 (__m128 hi, __m128 lo) pure @trusted
{
    // DMD PERF
    static if (GDC_with_AVX)
    {
        // Build a 256-bit value from `lo`, then insert `hi` at 128-bit position 1.
        return __builtin_ia32_vinsertf128_ps256(__builtin_ia32_ps256_ps(lo), hi, 1);
    }
    else
    {
        // Copy lane by lane.
        // Do NOT reinterpret the result as two `__m128` and assign the halves:
        // that doesn't work if the AVX vector is emulated, but the SSE vector is not.
        // See issue #108
        __m256 result = void; // all 8 lanes are written below
        foreach (i; 0 .. 4)
        {
            result.ptr[i]     = lo.array[i];
            result.ptr[i + 4] = hi.array[i];
        }
        return result;
    }
}
unittest
{
    __m128 lowHalf  = _mm_setr_ps(1.0f, 2, 3, 4);
    __m128 highHalf = _mm_setr_ps(3.0f, 4, 5, 6);
    __m256 R = _mm256_set_m128(highHalf, lowHalf);
    float[8] correct = [1.0f, 2, 3, 4, 3, 4, 5, 6];
    assert(R.array == correct);
}

/// Set packed `__m256d` vector with the supplied values.
/// `lo` fills lanes 0..1 and `hi` fills lanes 2..3 of the result.
__m256d _mm256_set_m128d (__m128d hi, __m128d lo) pure @trusted
{
    __m256d result = void; // all 4 lanes are written below
    foreach (i; 0 .. 2)
    {
        result.ptr[i]     = lo.array[i];
        result.ptr[i + 2] = hi.array[i];
    }
    return result;
}
unittest
{
    __m128d lowHalf  = _mm_setr_pd(1.0, 2.0);
    __m128d highHalf = _mm_setr_pd(3.0, 4.0);
    __m256d R = _mm256_set_m128d(highHalf, lowHalf);
    double[4] correct = [1.0, 2.0, 3.0, 4.0];
    assert(R.array == correct);
}

/// Set packed `__m256i` vector with the supplied values.
__m256i _mm256_set_m128i (__m128i hi, __m128i lo) pure @trusted
{
    // DMD PERF
    static if (GDC_with_AVX)
    {
        // Build a 256-bit value from `lo`, then insert `hi` at 128-bit position 1.
        int8 widened = cast(int8) __builtin_ia32_si256_si(lo);
        return cast(long4) __builtin_ia32_vinsertf128_si256(widened, hi, 1);
    }
    else
    {
        // Copy the eight 32-bit lanes one by one: `lo` first, then `hi`.
        int8 result = void; // all 8 lanes are written below
        foreach (i; 0 .. 4)
        {
            result.ptr[i]     = lo.array[i];
            result.ptr[i + 4] = hi.array[i];
        }
        return cast(long4) result;
    }
}
unittest
{
    __m128i lowHalf  = _mm_setr_epi32( 1,  2,  3,  4);
    __m128i highHalf =  _mm_set_epi32(-3, -4, -5, -6);
    int8 R = cast(int8)_mm256_set_m128i(highHalf, lowHalf);
    int[8] correct = [1, 2, 3, 4, -6, -5, -4, -3];
    assert(R.array == correct);
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied values.
/// `e0` is stored in lane 0, `e3` in lane 3.
__m256d _mm256_set_pd (double e3, double e2, double e1, double e0) pure @trusted
{
    __m256d result = void; // all 4 lanes are written below
    result.ptr[3] = e3;
    result.ptr[2] = e2;
    result.ptr[1] = e1;
    result.ptr[0] = e0;
    return result;
}
unittest
{
    __m256d A = _mm256_set_pd(3, 2, 1, 546);
    double[4] correct = [546.0, 1.0, 2.0, 3.0];
    assert(A.array == correct);
}

/// Set packed single-precision (32-bit) floating-point elements with the supplied values.
/// `e0` is stored in lane 0, `e7` in lane 7.
__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted
{
    // PERF: see #102, use = void?
    __m256 result;
    result.ptr[7] = e7;
    result.ptr[6] = e6;
    result.ptr[5] = e5;
    result.ptr[4] = e4;
    result.ptr[3] = e3;
    result.ptr[2] = e2;
    result.ptr[1] = e1;
    result.ptr[0] = e0;
    return result;
}
unittest
{
    __m256 A = _mm256_set_ps(3, 2, 1, 546.0f, -1.25f, -2, -3, 0);
    float[8] correct = [0, -3, -2, -1.25f, 546.0f, 1.0, 2.0, 3.0];
    assert(A.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi16 (short a) pure @trusted
{
    // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    // It used to ICE, after that the codegen was just wrong.
    // No issue anymore in DMD 2.101, we can eventually remove that
    version(DigitalMars)
        enum bool dmdNeedsWorkaround = __VERSION__ < 2101;
    else
        enum bool dmdNeedsWorkaround = false;

    static if (dmdNeedsWorkaround)
    {
        short16 v = a;
        return cast(__m256i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(short16(a));
    }
}
unittest
{
    short16 a = cast(short16) _mm256_set1_epi16(31);
    foreach (i; 0 .. 16)
        assert(a.array[i] == 31);
}

/// Broadcast 32-bit integer `a` to all elements.
__m256i _mm256_set1_epi32 (int a) pure @trusted
{
    // Older DMD takes a separate code path.
    // No issue anymore in DMD 2.101, we can eventually remove that
    version(DigitalMars)
        enum bool dmdNeedsWorkaround = __VERSION__ < 2101;
    else
        enum bool dmdNeedsWorkaround = false;

    static if (dmdNeedsWorkaround)
    {
        int8 v = a;
        return cast(__m256i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(int8(a));
    }
}
unittest
{
    int8 a = cast(int8) _mm256_set1_epi32(31);
    foreach (i; 0 .. 8)
        assert(a.array[i] == 31);
}

/// Broadcast 64-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi64x (long a) pure @trusted
{
    // Note: `pure @trusted` matches the other `_mm256_set1_*` functions,
    // so that this function can be called from `pure` code too.
    return cast(__m256i)(long4(a));
}
unittest
{
    long4 a = cast(long4) _mm256_set1_epi64x(-31);
    for (int i = 0; i < 4; ++i)
        assert(a.array[i] == -31);
}
///ditto
alias _mm256_set1_epi64 = _mm256_set1_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API.

/// Broadcast 8-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi8 (byte a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        byte32 v = a;
        return cast(__m256i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(byte32(a));
    }
}
unittest
{
    byte32 a = cast(byte32) _mm256_set1_epi8(31);
    for (int i = 0; i < 32; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast double-precision (64-bit) floating-point value `a` to all elements of the return value.
__m256d _mm256_set1_pd (double a) pure @trusted
{
    return __m256d(a);
}
unittest
{
    double a = 464.21;
    double[4] correct = [a, a, a, a];
    double4 A = cast(double4) _mm256_set1_pd(a);
    assert(A.array == correct);
}

/// Broadcast single-precision (32-bit) floating-point value `a` to all elements of the return value.
__m256 _mm256_set1_ps (float a) pure @trusted
{
    return __m256(a);
}
unittest
{
    float a = 464.21f;
    float[8] correct = [a, a, a, a, a, a, a, a];
    float8 A = cast(float8) _mm256_set1_ps(a);
    assert(A.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9,  short e8,
                           short e7,  short e6,  short e5,  short e4,  short e3,  short e2,  short e1,  short e0) pure @trusted
{
    // "Reverse order": the first argument (`e15`) lands in lane 0, the last (`e0`) in lane 15.
    short[16] lanes = [ e15, e14, e13, e12, e11, e10, e9,  e8,
                        e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0];
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) lanes.ptr);
    }
    else version(LDC)
    {
        return cast(__m256i)( loadUnaligned!(short16)(lanes.ptr) );
    }
    else
    {
        // Generic path: copy the staged values lane by lane.
        short16 vec;
        foreach (n, value; lanes)
            vec.ptr[n] = value;
        return cast(__m256i) vec;
    }
}
unittest
{
    short16 A = cast(short16) _mm256_setr_epi16(-1, 0, -21, 21, 42, 127, -42, -128,
                                                -1, 0, -21, 21, 42, 127, -42, -128);
    short[16] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128];
    assert(A.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
/// The first argument (`e7`) is stored in lane 0, the last (`e0`) in lane 7.
__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
{
    // Inlines a constant with GCC -O1, LDC -O2
    int8 vec; // = void would prevent GDC from inlining a constant call
    vec.ptr[7] = e0;
    vec.ptr[6] = e1;
    vec.ptr[5] = e2;
    vec.ptr[4] = e3;
    vec.ptr[3] = e4;
    vec.ptr[2] = e5;
    vec.ptr[1] = e6;
    vec.ptr[0] = e7;
    return cast(__m256i) vec;
}
unittest
{
    int8 A = cast(int8) _mm256_setr_epi32(-1, 0, -2147483648, 2147483647, 42, 666, -42, -666);
    int[8] correct = [-1, 0, -2147483648, 2147483647, 42, 666, -42, -666];
    assert(A.array == correct);
}

/// Set packed 64-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi64x (long e3, long e2, long e1, long e0) pure @trusted
{
    // "Reverse order": the first argument (`e3`) lands in lane 0, the last (`e0`) in lane 3.
    long4 vec = void; // all 4 lanes are written below
    vec.ptr[3] = e0;
    vec.ptr[2] = e1;
    vec.ptr[1] = e2;
    vec.ptr[0] = e3;
    return vec;
}
unittest
{
    __m256i A = _mm256_setr_epi64x(-1, 42, long.min, long.max);
    long[4] correct = [-1, 42, long.min, long.max];
    assert(A.array == correct);
}
///ditto
alias _mm256_setr_epi64 = _mm256_setr_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API.

/// Set packed 8-bit integers with the supplied values in reverse order.
/// The first argument (`e31`) is stored in byte lane 0, the last (`e0`) in byte lane 31.
__m256i _mm256_setr_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
                          byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
                          byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9,  byte e8,
                          byte e7,  byte e6,  byte e5,  byte e4,  byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    // Inline a constant call in GDC -O1 and LDC -O2
    align(32) byte[32] lanes = [ e31, e30, e29, e28, e27, e26, e25, e24,
                                 e23, e22, e21, e20, e19, e18, e17, e16,
                                 e15, e14, e13, e12, e11, e10, e9,  e8,
                                 e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0];
    // The staging array is 32-byte aligned, so it can be read back as one vector.
    __m256i* asVector = cast(__m256i*)(lanes.ptr);
    return *asVector;
}
unittest
{
    byte32 A = cast(byte32) _mm256_setr_epi8( -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128);
    byte[32] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128];
    assert(A.array == correct);
}

/// Set packed `__m256` vector with the supplied values.
__m256 _mm256_setr_m128 (__m128 lo, __m128 hi) pure @trusted
{
    // Same as `_mm256_set_m128`, with the two arguments swapped.
    // Note: `pure @trusted` matches `_mm256_set_m128`, so that this wrapper
    // can be called from `pure` code too.
    return _mm256_set_m128(hi, lo);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
    __m128 B = _mm_setr_ps(3.0f, 4, 5, 6);
    __m256 R = _mm256_setr_m128(B, A);
    float[8] correct = [3.0f, 4, 5, 6, 1, 2, 3, 4,];
    assert(R.array == correct);
}

/// Set packed `__m256d` vector with the supplied values.
__m256d _mm256_setr_m128d (__m128d lo, __m128d hi) pure @trusted
{
    // Same as `_mm256_set_m128d`, with the two arguments swapped.
    return _mm256_set_m128d(hi, lo);
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    __m256d R = _mm256_setr_m128d(B, A);
    double[4] correct = [3.0, 4.0, 1.0, 2.0];
    assert(R.array == correct);
}

/// Set packed `__m256i` vector with the supplied values.
__m256i _mm256_setr_m128i (__m128i lo, __m128i hi) pure @trusted
{
    // Same as `_mm256_set_m128i`, with the two arguments swapped.
    return _mm256_set_m128i(hi, lo);
}
unittest
{
    __m128i A = _mm_setr_epi32( 1,  2,  3,  4);
    __m128i B =  _mm_set_epi32(-3, -4, -5, -6);
    int8 R = cast(int8)_mm256_setr_m128i(B, A);
    int[8] correct = [-6, -5, -4, -3, 1, 2, 3, 4];
    assert(R.array == correct);
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
__m256d _mm256_setr_pd (double e3, double e2, double e1, double e0) pure @trusted
{
    // "Reverse order": the first argument (`e3`) lands in lane 0, the last (`e0`) in lane 3.
    version(LDC)
    {
        // PERF, probably not the best
        double[4] result = [e3, e2, e1, e0];
        return loadUnaligned!(double4)(result.ptr);
    }
    else
    {
        __m256d r;
        r.ptr[0] = e3;
        r.ptr[1] = e2;
        r.ptr[2] = e1;
        r.ptr[3] = e0;
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(3, 2, 1, 546.125);
    double[4] correct = [3.0, 2.0, 1.0, 546.125];
    assert(A.array == correct);
}


/// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order.
__m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted
{
    // "Reverse order": the first argument (`e7`) lands in lane 0, the last (`e0`) in lane 7.

    // GDC with AVX and LDC both go through an aligned staging array.
    version(LDC)
        enum bool loadFromArray = true;
    else
        enum bool loadFromArray = GDC_with_AVX;

    // PERF DMD
    static if (loadFromArray)
    {
        align(32) float[8] lanes = [ e7, e6, e5, e4, e3, e2, e1, e0];
        return *cast(__m256*)(lanes.ptr);
    }
    else
    {
        __m256 vec;
        vec.ptr[7] = e0;
        vec.ptr[6] = e1;
        vec.ptr[5] = e2;
        vec.ptr[4] = e3;
        vec.ptr[3] = e4;
        vec.ptr[2] = e5;
        vec.ptr[1] = e6;
        vec.ptr[0] = e7;
        return vec;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(   3, 2, 1, 546.125f, 4, 5, 6, 7);
    float[8] correct =       [3.0f, 2, 1, 546.125f, 4, 5, 6, 7];
    assert(A.array == correct);
}

/// Return vector of type `__m256d` with all elements set to zero.
__m256d _mm256_setzero_pd() pure @safe
{
    // Broadcast 0.0 to the four double lanes.
    return double4(0.0);
}
unittest
{
    __m256d Z = _mm256_setzero_pd();
    double[4] allZero = [0.0, 0.0, 0.0, 0.0];
    assert(Z.array == allZero);
}

/// Return vector of type `__m256` with all elements set to zero.
__m256 _mm256_setzero_ps() pure @safe
{
    // Broadcast 0.0f to the eight float lanes.
    return float8(0.0f);
}
unittest
{
    __m256 Z = _mm256_setzero_ps();
    float[8] allZero = [0.0f, 0, 0, 0, 0, 0, 0, 0];
    assert(Z.array == allZero);
}

/// Return vector of type `__m256i` with all elements set to zero.
__m256i _mm256_setzero_si256() pure @trusted
{
    // Broadcast 0 to the four 64-bit lanes.
    return __m256i(0);
}
unittest
{
    __m256i Z = _mm256_setzero_si256();
    long[4] allZero = [0, 0, 0, 0];
    assert(Z.array == allZero);
}

/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the
/// control in `imm8`.
__m256d _mm256_shuffle_pd(int imm8)(__m256d a, __m256d b) pure @trusted
{
    // One selector bit per result element: bit `n` of `imm8` picks the low (0)
    // or high (1) double of the 128-bit lane that result element `n` reads from.
    enum int sel0 = (imm8 >> 0) & 1; // result[0] comes from a[0..1]
    enum int sel1 = (imm8 >> 1) & 1; // result[1] comes from b[0..1]
    enum int sel2 = (imm8 >> 2) & 1; // result[2] comes from a[2..3]
    enum int sel3 = (imm8 >> 3) & 1; // result[3] comes from b[2..3]

    // PERF DMD D_SIMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_shufpd256(a, b, imm8);
    }
    else version(LDC)
    {
        // Indices 0..3 address `a`, indices 4..7 address `b`.
        return shufflevectorLDC!(double4,
                                 sel0,
                                 4 + sel1,
                                 2 + sel2,
                                 6 + sel3)(a, b);
    }
    else
    {
        double4 result = void; // all 4 lanes are written below
        result.ptr[0] = a.array[sel0];
        result.ptr[1] = b.array[sel1];
        result.ptr[2] = a.array[2 + sel2];
        result.ptr[3] = b.array[2 + sel3];
        return result;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd( 0, 1, 2, 3);
    __m256d B = _mm256_setr_pd( 4, 5, 6, 7);
    __m256d C = _mm256_shuffle_pd!75 /* 01001011 */(A, B);
    double[4] correct = [1.0, 5.0, 2.0, 7.0];
    assert(C.array == correct);
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using
/// the control in `imm8`.
__m256 _mm256_shuffle_ps(int imm8)(__m256 a, __m256 b) pure @trusted
{
    // Four 2-bit selectors; the same selectors are applied to both 128-bit lanes.
    // Selectors 0 and 1 read from `a`, selectors 2 and 3 read from `b`.
    enum int sel0 = (imm8 >> 0) & 3;
    enum int sel1 = (imm8 >> 2) & 3;
    enum int sel2 = (imm8 >> 4) & 3;
    enum int sel3 = (imm8 >> 6) & 3;

    // PERF DMD D_SIMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_shufps256(a, b, imm8);
    }
    else version(LDC)
    {
        // Indices 0..7 address `a`, indices 8..15 address `b`.
        return shufflevectorLDC!(float8,
                                 sel0,
                                 sel1,
                                 8 + sel2,
                                 8 + sel3,
                                 4 + sel0,
                                 4 + sel1,
                                 12 + sel2,
                                 12 + sel3)(a, b);
    }
    else
    {
        float8 result = void; // all 8 lanes are written below
        // low 128-bit lane
        result.ptr[0] = a.array[sel0];
        result.ptr[1] = a.array[sel1];
        result.ptr[2] = b.array[sel2];
        result.ptr[3] = b.array[sel3];
        // high 128-bit lane
        result.ptr[4] = a.array[4 + sel0];
        result.ptr[5] = a.array[4 + sel1];
        result.ptr[6] = b.array[4 + sel2];
        result.ptr[7] = b.array[4 + sel3];
        return result;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps( 0,  1,  2,  3,  4,  5,  6,  7);
    __m256 B = _mm256_setr_ps( 8,  9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_shuffle_ps!75 /* 01001011 */(A, B);
    float[8] correct = [3.0f, 2, 8, 9, 7, 6, 12, 13];
    assert(C.array == correct);
}

/// Compute the square root of packed double-precision (64-bit) floating-point elements in `a`.
__m256d _mm256_sqrt_pd (__m256d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_sqrtpd256(a);
    }
    else version(LDC)
    {
        return llvm_sqrt(a);
    }
    else
    {
        // Generic path: take the square root of each lane in place
        // (`a` is a by-value copy).
        foreach (i; 0 .. 4)
            a.ptr[i] = sqrt(a.array[i]);
        return a;
    }
}
unittest
{
    __m256d A = _mm256_sqrt_pd(_mm256_set1_pd(4.0));
    double[4] correct = [2.0, 2, 2, 2];
    assert(A.array == correct);
}

/// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
__m256 _mm256_sqrt_ps (__m256 a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_sqrtps256(a);
    }
    else version(LDC)
    {
        return llvm_sqrt(a);
    }
    else
    {
        // Generic path: take the square root of each lane in place
        // (`a` is a by-value copy).
        foreach (i; 0 .. 8)
            a.ptr[i] = sqrt(a.array[i]);
        return a;
    }
}
unittest
{
    __m256 A = _mm256_sqrt_ps(_mm256_set1_ps(4.0f));
    float[8] correct = [2.0f, 2, 2, 2, 2, 2, 2, 2];
    assert(A.array == correct);
}

/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
/// exception may be generated.
void _mm256_store_pd (double* mem_addr, __m256d a) pure @system
{
    // Reinterpret the destination as a whole vector and store it in one assignment.
    __m256d* dest = cast(__m256d*) mem_addr;
    *dest = a;
}
unittest
{
    align(32) double[4] mem;
    double[4] correct = [1.0, 2, 3, 4];
    _mm256_store_pd(mem.ptr, _mm256_setr_pd(1.0, 2, 3, 4));
    assert(mem == correct);
}

/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
/// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
/// exception may be generated.
void _mm256_store_ps (float* mem_addr, __m256 a) pure @system
{
    // Reinterpret the destination as a whole vector and store it in one assignment.
    __m256* dest = cast(__m256*) mem_addr;
    *dest = a;
}
unittest
{
    align(32) float[8] mem;
    float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8];
    _mm256_store_ps(mem.ptr, _mm256_set_ps(8.0, 7, 6, 5, 4, 3, 2, 1));
    assert(mem == correct);
}

/// Store 256-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 32-byte
/// boundary or a general-protection exception may be generated.
void _mm256_store_si256 (__m256i * mem_addr, __m256i a) pure @safe
{
    // Plain vector assignment through the (aligned) pointer.
    *mem_addr = a;
}
unittest
{
    align(32) long[4] mem;
    long[4] correct = [5, -6, -7, 8];
    _mm256_store_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
    assert(mem == correct);
}

/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm256_storeu_pd (double * mem_addr, __m256d a) pure @system
{
    // PERF: DMD
    static if (GDC_with_AVX)
    {
        __builtin_ia32_storeupd256(mem_addr, a);
    }
    else version(LDC)
    {
        storeUnaligned!__m256d(a, mem_addr);
    }
    else
    {
        // Generic path: scalar stores, one per lane.
        foreach (n; 0 .. 4)
            mem_addr[n] = a.array[n];
    }
}
unittest
{
    align(32) double[6] arr = [0.0, 0, 0, 0, 0, 0];
    _mm256_storeu_pd(&arr[1], _mm256_set1_pd(4.0));
    double[4] correct = [4.0, 4, 4, 4];
    assert(arr[1..5] == correct);
}

/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm256_storeu_ps (float* mem_addr, __m256 a) pure @system
{
    // PERF: DMD
    static if (GDC_with_AVX)
    {
        __builtin_ia32_storeups256(mem_addr, a);
    }
    else version(LDC)
    {
        storeUnaligned!__m256(a, mem_addr);
    }
    else
    {
        // Generic path: scalar stores, one per lane.
        foreach (n; 0 .. 8)
            mem_addr[n] = a.array[n];
    }
}
unittest
{
    align(32) float[10] arr = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    _mm256_storeu_ps(&arr[1], _mm256_set1_ps(4.0f));
    float[8] correct = [4.0f, 4, 4, 4, 4, 4, 4, 4];
    assert(arr[1..9] == correct);
}


/// Store 256-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned
/// on any particular boundary.
void _mm256_storeu_si256 (__m256i* mem_addr, __m256i a) pure @trusted
{
    // PERF: DMD
    static if (GDC_with_AVX)
    {
        __builtin_ia32_storedqu256(cast(char*)mem_addr, cast(ubyte32) a);
    }
    else version(LDC)
    {
        storeUnaligned!__m256i(a, cast(long*)mem_addr);
    }
    else
    {
        // Generic path: write the four 64-bit lanes as scalars.
        long* dest = cast(long*) mem_addr;
        foreach (n; 0 .. 4)
            dest[n] = a.array[n];
    }
}
unittest
{
    align(32) long[6] arr = [0, 0, 0, 0, 0, 0];
    _mm256_storeu_si256( cast(__m256i*) &arr[1], _mm256_set1_epi64x(4));
    long[4] correct = [4, 4, 4, 4];
    assert(arr[1..5] == correct);
}

/// Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a` into memory two different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a) pure @system
{
    // This is way better on GDC, and similarly in LDC, vs using other intrinsics
    // The low half is written completely before the high half.
    foreach (i; 0 .. 4)
        loaddr[i] = a.array[i];
    foreach (i; 0 .. 4)
        hiaddr[i] = a.array[4 + i];
}
unittest
{
    align(32) float[11] A = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    _mm256_storeu2_m128(&A[1], &A[6], _mm256_set1_ps(2.0f));
    float[11] correct     = [0.0f, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0];
    assert(A == correct);
}

/// Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory two different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a) pure @system
{
    // Scalar stores; the low half is written completely before the high half.
    foreach (i; 0 .. 2)
        loaddr[i] = a.array[i];
    foreach (i; 0 .. 2)
        hiaddr[i] = a.array[2 + i];
}
unittest
{
    double[2] A;
    double[2] B;
    _mm256_storeu2_m128d(A.ptr, B.ptr, _mm256_set1_pd(-43.0));
    double[2] correct = [-43.0, -43];
    assert(A == correct);
    assert(B == correct);
}

/// Store the high and low 128-bit halves (each composed of integer data) from `a` into memory two
/// different 128-bit locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a) pure @trusted
{
    // Each 128-bit destination is written as two 64-bit scalars,
    // the low half completely before the high half.
    long* lowDest  = cast(long*) loaddr;
    long* highDest = cast(long*) hiaddr;
    foreach (i; 0 .. 2)
        lowDest[i] = a.array[i];
    foreach (i; 0 .. 2)
        highDest[i] = a.array[2 + i];
}
unittest
{
    long[2] A;
    long[2] B;
    _mm256_storeu2_m128i(cast(__m128i*)A.ptr, cast(__m128i*)B.ptr, _mm256_set1_epi64x(-42));
    long[2] correct = [-42, -42];
    assert(A == correct);
    assert(B == correct);
}

/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_pd (double* mem_addr, __m256d a) pure @system
{
    // PERF DMD
    // PERF GDC + SSE2
    version(LDC)
    {
        // 32-byte aligned store carrying the `!nontemporal` metadata.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x double> %1, <4 x double>* %0, align 32, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, double4*, double4)(cast(double4*)mem_addr, a);
    }
    else static if (GDC_with_AVX)
    {
        __builtin_ia32_movntpd256 (mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m256d* dest = cast(__m256d*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(32) double[4] mem;
    double[4] correct = [5.0, -6, -7, 8];
    _mm256_stream_pd(mem.ptr, _mm256_setr_pd(5.0, -6, -7, 8));
    assert(mem == correct);
}

/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
/// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_ps (float* mem_addr, __m256 a) pure @system
{
    // PERF DMD
    // PERF GDC + SSE2
    version(LDC)
    {
        // 32-byte aligned store carrying the `!nontemporal` metadata.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <8 x float> %1, <8 x float>* %0, align 32, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, float8*, float8)(cast(float8*)mem_addr, a);
    }
    else static if (GDC_with_AVX)
    {
        __builtin_ia32_movntps256 (mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m256* dest = cast(__m256*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(32) float[8] mem;
    float[8] correct = [5, -6, -7, 8, 1, 2, 3, 4];
    _mm256_stream_ps(mem.ptr, _mm256_setr_ps(5, -6, -7, 8, 1, 2, 3, 4));
    assert(mem == correct);
}

/// Store 256-bits of integer data from `a` into memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be
/// generated.
/// Note: with GDC, this is performed as two 128-bit `_mm_stream_si128` stores.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_si256 (__m256i * mem_addr, __m256i a) pure @trusted
{
    // PERF DMD
    // PERF GDC
    version(LDC)
    {
        // The store is declared 32-byte aligned, like `_mm256_stream_pd` and
        // `_mm256_stream_ps`: this is the documented requirement on `mem_addr`,
        // and it doesn't force the backend to assume only 16-byte alignment.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x i64> %1, <4 x i64>* %0, align 32, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, long4*, long4)(mem_addr, a);
    }
    else static if (GDC_with_SSE2) // any hope to be non-temporal? Using SSE2 instructions.
    {
        // Split the vector in two 128-bit halves, and stream each half separately.
        long2 lo, hi;
        lo.ptr[0] = a.array[0];
        lo.ptr[1] = a.array[1];
        hi.ptr[0] = a.array[2];
        hi.ptr[1] = a.array[3];
        _mm_stream_si128(cast(__m128i*)mem_addr, cast(__m128i)lo);
        _mm_stream_si128((cast(__m128i*)mem_addr) + 1, cast(__m128i)hi);
    }
    else
    {
        // Regular store instead.
        *mem_addr = a;
    }
}
unittest
{
    align(32) long[4] mem;
    long[4] correct = [5, -6, -7, 8];
    _mm256_stream_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
    assert(mem == correct);
}

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from
/// packed double-precision (64-bit) floating-point elements in `a`.
__m256d _mm256_sub_pd (__m256d a, __m256d b) pure @safe
{
    // Lane-wise `a - b` using the vector operator.
    return a - b;
}
unittest
{
    __m256d a = [1.5, -2.0, 3.0, 200000.0];
    a = _mm256_sub_pd(a, a);
    double[4] allZero = [0.0, 0, 0, 0];
    assert(a.array == allZero);
}

/// Subtract packed single-precision (32-bit) floating-point elements in `b` from
/// packed single-precision (32-bit) floating-point elements in `a`.
__m256 _mm256_sub_ps (__m256 a, __m256 b) pure @safe
{
    // Lane-wise `a - b` using the vector operator.
    return a - b;
}
unittest
{
    __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2000.0f, 3.0f, 1.0f];
    a = _mm256_sub_ps(a, a);
    float[8] allZero = [0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == allZero);
}

/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and
/// return 1 if the sign bit of each 64-bit element in the intermediate value is zero,
/// otherwise return 0.
int _mm_testc_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestcpd(a, b);
    }
    else
    {
        // PERF: maybe do the generic version more like simde
        long2 bitsA = cast(long2)a;
        long2 bitsB = cast(long2)b;
        long2 notAandB = ~bitsA & bitsB;
        // A lane with its sign bit set reads as a negative 64-bit integer.
        foreach (i; 0 .. 2)
        {
            if (notAandB.array[i] < 0)
                return 0;
        }
        return 1;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(-1, 1);
    __m128d B = _mm_setr_pd(-1, -1);
    __m128d C = _mm_setr_pd(1, -1);
    assert(_mm_testc_pd(A, A) == 1);
    assert(_mm_testc_pd(A, B) == 0);
    assert(_mm_testc_pd(B, A) == 1);
}

///ditto
int _mm256_testc_pd (__m256d a, __m256d b) pure @safe
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestcpd256(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // better to split than do vanilla (down to 10 inst)
        int lowResult  = _mm_testc_pd(_mm256_extractf128_pd!0(a), _mm256_extractf128_pd!0(b));
        int highResult = _mm_testc_pd(_mm256_extractf128_pd!1(a), _mm256_extractf128_pd!1(b));
        return lowResult & highResult;
    }
    else
    {
        // PERF: do the generic version more like simde, maybe this get rids of arm64 version
        long4 bitsA = cast(long4)a;
        long4 bitsB = cast(long4)b;
        long4 notAandB = ~bitsA & bitsB;
        // A lane with its sign bit set reads as a negative 64-bit integer.
        foreach (i; 0 .. 4)
        {
            if (notAandB.array[i] < 0)
                return 0;
        }
        return 1;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(-1, 1, -1, 1);
    __m256d B = _mm256_setr_pd(-1, -1, -1, -1);
    __m256d C = _mm256_setr_pd(1, -1, 1, -1);
    assert(_mm256_testc_pd(A, A) == 1);
    assert(_mm256_testc_pd(A, B) == 0);
    assert(_mm256_testc_pd(B, A) == 1);
}

/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and
/// return 1 if the sign bit of each 32-bit element in the intermediate value is zero,
/// otherwise return 0.
int _mm_testc_ps (__m128 a, __m128 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestcps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        int4 bitsA = cast(int4)a;
        int4 bitsB = cast(int4)b;
        int4 notAandB = ~bitsA & bitsB;
        // Shifting right by 31 leaves only the (replicated) sign bit in each lane.
        int4 shiftBy31;
        shiftBy31 = 31;
        notAandB >>= shiftBy31;
        int[4] allZero = [0, 0, 0, 0];
        return notAandB.array == allZero;
    }
    else
    {
        // PERF: do the generic version more like simde, maybe this get rids of arm64 version
        int4 bitsA = cast(int4)a;
        int4 bitsB = cast(int4)b;
        int4 notAandB = ~bitsA & bitsB;
        // A lane with its sign bit set reads as a negative 32-bit integer.
        foreach (i; 0 .. 4)
        {
            if (notAandB.array[i] < 0)
                return 0;
        }
        return 1;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(-1, 1, -1, 1);
    __m128 B = _mm_setr_ps(-1, -1, -1, -1);
    __m128 C = _mm_setr_ps(1, -1, 1, -1);
    assert(_mm_testc_ps(A, A) == 1);
    assert(_mm_testc_ps(A, B) == 0);
    assert(_mm_testc_ps(B, A) == 1);
}

///ditto
int _mm256_testc_ps (__m256 a, __m256 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestcps256(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        int8 bitsA = cast(int8)a;
        int8 bitsB = cast(int8)b;
        int8 notAandB = ~bitsA & bitsB;
        // Shifting right by 31 leaves only the (replicated) sign bit in each lane.
        int8 shiftBy31;
        shiftBy31 = 31;
        notAandB >>= shiftBy31;
        int[8] allZero = [0, 0, 0, 0, 0, 0, 0, 0];
        return notAandB.array == allZero;
    }
    else
    {
        // PERF: do the generic version more like simde, maybe this get rids of arm64 version
        int8 bitsA = cast(int8)a;
        int8 bitsB = cast(int8)b;
        int8 notAandB = ~bitsA & bitsB;
        // A lane with its sign bit set reads as a negative 32-bit integer.
        foreach (i; 0 .. 8)
        {
            if (notAandB.array[i] < 0)
                return 0;
        }
        return 1;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(-1,  1, -1,  1, -1,  1, -1,  1);
    __m256 B = _mm256_setr_ps(-1, -1, -1, -1, -1, -1, -1, -1);
    __m256 C = _mm256_setr_ps( 1, -1,  1, -1,  1,  1,  1,  1);
    assert(_mm256_testc_ps(A, A) == 1);
    assert(_mm256_testc_ps(B, B) == 1);
    assert(_mm256_testc_ps(A, B) == 0);
    assert(_mm256_testc_ps(B, A) == 1);
    assert(_mm256_testc_ps(C, B) == 0);
    assert(_mm256_testc_ps(B, C) == 1);
}

/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the result is zero,
/// otherwise return 0.
/// In other words, test if all bits masked by `b` are also 1 in `a`.
int _mm256_testc_si256 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_ptestc256(cast(long4)a, cast(long4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // better to split than do vanilla (down to 10 inst)
        int lowResult  = _mm_testc_si128(_mm256_extractf128_si256!0(a), _mm256_extractf128_si256!0(b));
        int highResult = _mm_testc_si128(_mm256_extractf128_si256!1(a), _mm256_extractf128_si256!1(b));
        return lowResult & highResult;
    }
    else
    {
        // Bits that are set in `b` but not in `a`; the test passes when there are none.
        __m256i missingBits = ~a & b;
        long[4] allZero = [0, 0, 0, 0];
        return missingBits.array == allZero;
    }
}
unittest
{
    __m256i A  = _mm256_setr_epi64(0x01, 0x02, 0x04, 0xf8);
    __m256i M1 = _mm256_setr_epi64(0xfe, 0xfd, 0x00, 0x00);
    __m256i M2 = _mm256_setr_epi64(0x00, 0x00, 0x04, 0x00);
    assert(_mm256_testc_si256(A, A) == 1);
    assert(_mm256_testc_si256(A, M1) == 0);
    assert(_mm256_testc_si256(A, M2) == 1);
}

/// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 128-bit value, and set ZF to 1 if the
/// sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`,
/// AND there is at least one negative number in `b` that corresponds to a negative number in `a`.
int _mm_testnzc_pd (__m128d a, __m128d b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcpd(a, b);
    }
    else
    {
        // ZF = 0 <=> some lane is negative in both `a` and `b`.
        // ZF = 1 <=> no lane is negative in both.
        // CF = 0 <=> some lane is negative in `b` but not negative in `a`.
        // CF = 1 <=> every lane that is negative in `b` is also negative in `a`.
        // The result is 1 only when CF = 0 and ZF = 0, i.e. when both kinds of lane exist.
        long2 ia = cast(long2)a;
        long2 ib = cast(long2)b;

        long2 bothNeg = ia & ib;
        long2 onlyB   = ~ia & ib;

        // OR-ing the lanes merges their sign bits: the merged value is negative
        // as soon as one lane has its sign bit set.
        long anyBothNeg = bothNeg.array[0] | bothNeg.array[1];
        long anyOnlyB   = onlyB.array[0] | onlyB.array[1];

        return (anyBothNeg < 0 && anyOnlyB < 0) ? 1 : 0;
    }
}
unittest
{
    __m128d posNeg = _mm_setr_pd( 1, -1);
    __m128d negPos = _mm_setr_pd(-1,  1);
    __m128d negNeg = _mm_setr_pd(-1, -1);
    assert(_mm_testnzc_pd(posNeg, negPos) == 0);
    assert(_mm_testnzc_pd(posNeg, negNeg) == 1);
    assert(_mm_testnzc_pd(negPos, negPos) == 0);
    assert(_mm_testnzc_pd(negPos, negNeg) == 1);
    assert(_mm_testnzc_pd(negNeg, negNeg) == 0);
}

/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 256-bit value, and set ZF to 1 if the
/// sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`,
/// AND there is at least one negative number in `b` that corresponds to a negative number in `a`.
int _mm256_testnzc_pd (__m256d a, __m256d b) pure @safe
{
    // PERF DMD
    // PERF ARM64
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcpd256(a, b);
    }
    else
    {
        long4 ia = cast(long4)a;
        long4 ib = cast(long4)b;

        // Sign bit set where both inputs are negative (drives ZF)...
        long4 bothNeg = ia & ib;
        // ...and where only `b` is negative (drives CF).
        long4 onlyB = ~ia & ib;

        long anyBothNeg = bothNeg.array[0] | bothNeg.array[1]
                        | bothNeg.array[2] | bothNeg.array[3];
        long anyOnlyB   = onlyB.array[0] | onlyB.array[1]
                        | onlyB.array[2] | onlyB.array[3];

        // ZF == 0 and CF == 0 <=> both merged values have their sign bit set.
        return (anyBothNeg < 0 && anyOnlyB < 0) ? 1 : 0;
    }
}
unittest
{
    __m256d posNeg = _mm256_setr_pd( 1, -1, 1, 1);
    __m256d negPos = _mm256_setr_pd(-1,  1, 1, 1);
    __m256d negNeg = _mm256_setr_pd(-1, -1, 1, 1);
    assert(_mm256_testnzc_pd(posNeg, negPos) == 0);
    assert(_mm256_testnzc_pd(posNeg, negNeg) == 1);
    assert(_mm256_testnzc_pd(negPos, negPos) == 0);
    assert(_mm256_testnzc_pd(negPos, negNeg) == 1);
    assert(_mm256_testnzc_pd(negNeg, negNeg) == 0);
}

/// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 128-bit value, and set ZF to 1 if the
/// sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`,
/// AND there is at least one negative number in `b` that corresponds to a negative number in `a`.
int _mm_testnzc_ps (__m128 a, __m128 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcps(a, b);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;

        // Sign bit set where both inputs are negative (drives ZF)...
        int4 bothNeg = ia & ib;
        // ...and where only `b` is negative (drives CF).
        int4 onlyB = ~ia & ib;

        int anyBothNeg = bothNeg.array[0] | bothNeg.array[1]
                       | bothNeg.array[2] | bothNeg.array[3];
        int anyOnlyB   = onlyB.array[0] | onlyB.array[1]
                       | onlyB.array[2] | onlyB.array[3];

        // ZF == 0 and CF == 0 <=> both merged values have their sign bit set.
        return (anyBothNeg < 0 && anyOnlyB < 0) ? 1 : 0;
    }
}
unittest
{
    __m128 posNeg = _mm_setr_ps( 1, -1, 1, 1);
    __m128 negPos = _mm_setr_ps(-1,  1, 1, 1);
    __m128 negNeg = _mm_setr_ps(-1, -1, 1, 1);
    assert(_mm_testnzc_ps(posNeg, negPos) == 0);
    assert(_mm_testnzc_ps(posNeg, negNeg) == 1);
    assert(_mm_testnzc_ps(negPos, negPos) == 0);
    assert(_mm_testnzc_ps(negPos, negNeg) == 1);
    assert(_mm_testnzc_ps(negNeg, negNeg) == 0);
}

/// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 256-bit value, and set ZF to 1 if the
/// sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that corresponds to a positive number in `a`,
/// AND there is at least one negative number in `b` that corresponds to a negative number in `a`.
int _mm256_testnzc_ps (__m256 a, __m256 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcps256(a, b);
    }
    else
    {
        int8 ia = cast(int8)a;
        int8 ib = cast(int8)b;

        // Sign bit set where both inputs are negative (drives ZF)...
        int8 bothNeg = ia & ib;
        // ...and where only `b` is negative (drives CF).
        int8 onlyB = ~ia & ib;

        int anyBothNeg = bothNeg.array[0] | bothNeg.array[1]
                       | bothNeg.array[2] | bothNeg.array[3]
                       | bothNeg.array[4] | bothNeg.array[5]
                       | bothNeg.array[6] | bothNeg.array[7];
        int anyOnlyB   = onlyB.array[0] | onlyB.array[1]
                       | onlyB.array[2] | onlyB.array[3]
                       | onlyB.array[4] | onlyB.array[5]
                       | onlyB.array[6] | onlyB.array[7];

        // ZF == 0 and CF == 0 <=> both merged values have their sign bit set.
        return (anyBothNeg < 0 && anyOnlyB < 0) ? 1 : 0;
    }
}
unittest
{
    __m256 posNeg = _mm256_setr_ps(1, 1, 1, 1,  1, -1, 1, 1);
    __m256 negPos = _mm256_setr_ps(1, 1, 1, 1, -1,  1, 1, 1);
    __m256 negNeg = _mm256_setr_ps(1, 1, 1, 1, -1, -1, 1, 1);
    assert(_mm256_testnzc_ps(posNeg, negPos) == 0);
    assert(_mm256_testnzc_ps(posNeg, negNeg) == 1);
    assert(_mm256_testnzc_ps(negPos, negPos) == 0);
    assert(_mm256_testnzc_ps(negPos, negNeg) == 1);
    assert(_mm256_testnzc_ps(negNeg, negNeg) == 0);
}

/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm256_testnzc_si256 (__m256i a, __m256i b) pure @trusted
{
    // PERF ARM64
    // PERF DMD
    // PERF LDC without AVX
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_ptestnzc256(cast(long4) a, cast(long4) b);
    }
    else
    {
        // Need to defer to _mm_testnzc_si128 if possible, for more speed
        __m256i shared = a & b;   // bits of `b` that are also set in `a` (ZF source)
        __m256i extra  = ~a & b;  // bits of `b` that are clear in `a`    (CF source)

        long anyShared = shared.array[0] | shared.array[1]
                       | shared.array[2] | shared.array[3];
        long anyExtra  = extra.array[0] | extra.array[1]
                       | extra.array[2] | extra.array[3];

        // ZF == 0 and CF == 0 <=> both intermediate values are non-zero.
        return (anyShared != 0 && anyExtra != 0) ? 1 : 0;
    }
}
unittest
{
    __m256i value   = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0, 0, 0, 0);
    __m256i partial = _mm256_setr_epi32(0x01, 0x40, 0x00, 0x00, 0, 0, 0, 0);
    __m256i nothing = _mm256_setzero_si256();
    assert(_mm256_testnzc_si256(value, nothing) == 0);
    assert(_mm256_testnzc_si256(value, partial) == 1);
    assert(_mm256_testnzc_si256(value, value) == 0);
}

/// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 128-bit value, return 1 if the sign bit of
/// each 64-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number at the same place.
int _mm_testz_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzpd(a, b);
    }
    else
    {
        long2 ia = cast(long2)a;
        long2 ib = cast(long2)b;
        long2 bothNeg = ia & ib;

        // The OR of the lanes is negative as soon as one lane is negative in both inputs.
        long merged = bothNeg.array[0] | bothNeg.array[1];
        return (merged >= 0) ? 1 : 0;
    }
}
unittest
{
    __m128d negPos = _mm_setr_pd(-1,  1);
    __m128d negNeg = _mm_setr_pd(-1, -1);
    __m128d posNeg = _mm_setr_pd( 1, -1);
    assert(_mm_testz_pd(negPos, negPos) == 0);
    assert(_mm_testz_pd(negPos, negNeg) == 0);
    assert(_mm_testz_pd(posNeg, negPos) == 1);
}

/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 256-bit value, return 1 if the sign bit of
/// each 64-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number at the same place.
int _mm256_testz_pd (__m256d a, __m256d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzpd256(a, b);
    }
    else
    {
        long4 ia = cast(long4)a;
        long4 ib = cast(long4)b;
        long4 bothNeg = ia & ib;

        // The OR of the lanes is negative as soon as one lane is negative in both inputs.
        long merged = bothNeg.array[0] | bothNeg.array[1]
                    | bothNeg.array[2] | bothNeg.array[3];
        return (merged >= 0) ? 1 : 0;
    }
}
unittest
{
    __m256d alternating = _mm256_setr_pd(-1,  1, -1,  1);
    __m256d oneNeg      = _mm256_setr_pd( 1,  1, -1,  1);
    __m256d opposite    = _mm256_setr_pd( 1, -1,  1, -1);
    assert(_mm256_testz_pd(alternating, alternating) == 0);
    assert(_mm256_testz_pd(alternating, oneNeg) == 0);
    assert(_mm256_testz_pd(opposite, alternating) == 1);
}

/// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 128-bit value, return 1 if the sign bit of
/// each 32-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number at the same place.
int _mm_testz_ps (__m128 a, __m128 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzps(a, b);
    }
    else
    {
        int4 la = cast(int4)a;
        int4 lb = cast(int4)b;
        // A lane of `la & lb` has its sign bit set only when both inputs are negative there.
        int4 r = la & lb;
        // OR the lanes together so one sign test covers all of them.
        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
        // 1 when the merged sign bit is clear, 0 otherwise.
        return (~m >> 31) & 1;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(-1,  1, -1,  1);
    __m128 B = _mm_setr_ps( 1,  1, -1,  1);
    __m128 C = _mm_setr_ps( 1, -1,  1, -1);
    assert(_mm_testz_ps(A, A) == 0);
    assert(_mm_testz_ps(A, B) == 0);
    assert(_mm_testz_ps(C, A) == 1);
    assert(_mm_testz_ps(C, B) == 1);
}

/// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 256-bit value, return 1 if the sign bit of
/// each 32-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number at the same place.
int _mm256_testz_ps (__m256 a, __m256 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzps256(a, b);
    }
    else
    {
        int8 la = cast(int8)a;
        int8 lb = cast(int8)b;
        // A lane of `la & lb` has its sign bit set only when both inputs are negative there.
        int8 r = la & lb;
        // OR the lanes together so one sign test covers all of them.
        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3]
              | r.array[4] | r.array[5] | r.array[6] | r.array[7];
        // 1 when the merged sign bit is clear, 0 otherwise.
        return (~m >> 31) & 1;
    }
}
unittest
{
    // Mirrors the 128-bit test, with both 128-bit halves populated.
    __m256 A = _mm256_setr_ps(-1,  1, -1,  1, -1,  1, -1,  1);
    __m256 B = _mm256_setr_ps( 1,  1, -1,  1,  1,  1, -1,  1);
    __m256 C = _mm256_setr_ps( 1, -1,  1, -1,  1, -1,  1, -1);
    assert(_mm256_testz_ps(A, A) == 0);
    assert(_mm256_testz_ps(A, B) == 0);
    assert(_mm256_testz_ps(C, A) == 1);
    assert(_mm256_testz_ps(C, B) == 1);
}

/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`,
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm256_testz_si256 (__m256i a, __m256i b) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX || LDC_with_AVX)
    {
        // Both compilers expose the same builtin when AVX is enabled.
        return __builtin_ia32_ptestz256(cast(long4)a, cast(long4)b);
    }
    else version(LDC)
    {
        // better to split than do vanilla (down to 8 inst in arm64)
        int lowZero  = _mm_testz_si128(_mm256_extractf128_si256!0(a),
                                       _mm256_extractf128_si256!0(b));
        int highZero = _mm_testz_si128(_mm256_extractf128_si256!1(a),
                                       _mm256_extractf128_si256!1(b));
        return lowZero & highZero;
    }
    else
    {
        // Plain version: the test passes when `a & b` has no bit set at all.
        long[4] none = 0;
        __m256i overlap = a & b;
        return overlap.array == none;
    }
}
unittest
{
    __m256i value    = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0x01, 0x02, 0x04, 0xf8);
    __m256i disjoint = _mm256_setr_epi32(0xfe, 0xfd, 0x00, 0x07, 0xfe, 0xfd, 0x00, 0x07);
    __m256i touching = _mm256_setr_epi32(0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00);
    assert(_mm256_testz_si256(value, value) == 0);
    assert(_mm256_testz_si256(value, disjoint) == 1);
    assert(_mm256_testz_si256(value, touching) == 0);
}

/// Return vector of type __m256d with undefined elements.
__m256d _mm256_undefined_pd () pure @safe
{
    // Deliberately left uninitialized.
    __m256d uninit = void;
    return uninit;
}

/// Return vector of type __m256 with undefined elements.
__m256 _mm256_undefined_ps () pure @safe
{
    // Deliberately left uninitialized.
    __m256 uninit = void;
    return uninit;
}

/// Return vector of type __m256i with undefined elements.
__m256i _mm256_undefined_si256 () pure @safe
{
    // Deliberately left uninitialized.
    __m256i uninit = void;
    return uninit;
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of
/// each 128-bit lane in `a` and `b`.
__m256d _mm256_unpackhi_pd (__m256d a, __m256d b) pure @trusted
{
    version(LDC)
    {
        return shufflevectorLDC!(double4, 1, 5, 3, 7)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpckhpd256 (a, b);
    }
    else
    {
        // Each 128-bit lane holds two doubles; take the upper one from `a`, then from `b`.
        __m256d result;
        foreach (lane; 0 .. 2)
        {
            result.ptr[2 * lane]     = a.array[2 * lane + 1];
            result.ptr[2 * lane + 1] = b.array[2 * lane + 1];
        }
        return result;
    }
}
unittest
{
    __m256d first  = _mm256_setr_pd(1.0, 2, 3, 4);
    __m256d second = _mm256_setr_pd(5.0, 6, 7, 8);
    __m256d mixed  = _mm256_unpackhi_pd(first, second);
    double[4] expected = [2.0, 6, 4, 8];
    assert(mixed.array == expected);
}


/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of
/// each 128-bit lane in `a` and `b`.
__m256 _mm256_unpackhi_ps (__m256 a, __m256 b) pure @trusted
{
    version(LDC)
    {
        return shufflevectorLDC!(float8, 2, 10, 3, 11, 6, 14, 7, 15)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpckhps256 (a, b);
    }
    else
    {
        // Each 128-bit lane holds four floats; interleave its upper two from `a` and `b`.
        __m256 result;
        for (int base = 0; base < 8; base += 4)
        {
            foreach (k; 0 .. 2)
            {
                result.ptr[base + 2 * k]     = a.array[base + 2 + k];
                result.ptr[base + 2 * k + 1] = b.array[base + 2 + k];
            }
        }
        return result;
    }
}
unittest
{
    __m256 first  = _mm256_setr_ps(0.0f, 1,  2,  3,  4,  5,  6,  7);
    __m256 second = _mm256_setr_ps(8.0f, 9, 10, 11, 12, 13, 14, 15);
    __m256 mixed  = _mm256_unpackhi_ps(first, second);
    float[8] expected = [2.0f, 10, 3, 11, 6, 14, 7, 15];
    assert(mixed.array == expected);
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of
/// each 128-bit lane in `a` and `b`.
__m256d _mm256_unpacklo_pd (__m256d a, __m256d b) pure @trusted
{
    // Attributes match `_mm256_unpackhi_pd`, so this can be called from `pure` / `@safe` code.
    version(LDC)
    {
        return shufflevectorLDC!(double4, 0, 4, 2, 6)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpcklpd256 (a, b);
    }
    else
    {
        // Each 128-bit lane holds two doubles; take the lower one from `a`, then from `b`.
        __m256d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[2];
        r.ptr[3] = b.array[2];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    __m256d B = _mm256_setr_pd(5.0, 6, 7, 8);
    __m256d C = _mm256_unpacklo_pd(A, B);
    double[4] correct = [1.0, 5, 3, 7];
    assert(C.array == correct);
}

/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of
/// each 128-bit lane in `a` and `b`.
__m256 _mm256_unpacklo_ps (__m256 a, __m256 b) pure @trusted
{
    // Attributes match `_mm256_unpackhi_ps`, so this can be called from `pure` / `@safe` code.
    version(LDC)
    {
        return shufflevectorLDC!(float8, 0, 8, 1, 9, 4, 12, 5, 13)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpcklps256 (a, b);
    }
    else
    {
        // Each 128-bit lane holds four floats; interleave its lower two from `a` and `b`.
        __m256 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        r.ptr[4] = a.array[4];
        r.ptr[5] = b.array[4];
        r.ptr[6] = a.array[5];
        r.ptr[7] = b.array[5];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f, 1,  2,  3,  4,  5,  6,  7);
    __m256 B = _mm256_setr_ps(8.0f, 9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_unpacklo_ps(A, B);
    float[8] correct = [0.0f, 8, 1, 9, 4, 12, 5, 13];
    assert(C.array == correct);
}

/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_xor_pd (__m256d a, __m256d b) pure @safe
{
    // XOR is done on the integer reinterpretation of both operands.
    __m256i bitsA = cast(__m256i)a;
    __m256i bitsB = cast(__m256i)b;
    return cast(__m256d)(bitsA ^ bitsB);
}

/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_xor_ps (__m256 a, __m256 b) pure @safe
{
    // XOR is done on the integer reinterpretation of both operands.
    __m256i bitsA = cast(__m256i)a;
    __m256i bitsB = cast(__m256i)b;
    return cast(__m256)(bitsA ^ bitsB);
}

/// Issues the `vzeroall` builtin with GDC when AVX is enabled; does nothing otherwise.
void _mm256_zeroall () pure @safe
{
    // PERF: DMD needs to do it explicitely if AVX is ever used one day.

    static if (GDC_with_AVX)
        __builtin_ia32_vzeroall();

    // In every other configuration: do nothing.
    // The transitions penalty are supposed handled by the backend (eg: LLVM).
}

/// Issues the `vzeroupper` builtin with GDC when AVX is enabled; does nothing otherwise.
void _mm256_zeroupper () pure @safe
{
    // PERF: DMD needs to do it explicitely if AVX is ever used.

    static if (GDC_with_AVX)
        __builtin_ia32_vzeroupper();

    // In every other configuration: do nothing.
    // The transitions penalty are supposed handled by the backend (eg: LLVM).
}

/// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are zeroed.
__m256d _mm256_zextpd128_pd256 (__m128d a) pure @trusted
{
    __m256d widened;
    foreach (i; 0 .. 2)
    {
        widened.ptr[i]     = a.array[i]; // low half: copy of `a`
        widened.ptr[i + 2] = 0;          // high half: zero
    }
    return widened;
}
unittest
{
    __m256d widened = _mm256_zextpd128_pd256(_mm_setr_pd(2.0, -3.0));
    double[4] expected = [2.0, -3, 0, 0];
    assert(widened.array == expected);
}

/// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are zeroed.
__m256 _mm256_zextps128_ps256 (__m128 a) pure @trusted
{
    // The four floats are moved as two 64-bit lanes, then the result is cast back to floats.
    double2 lowHalf = cast(double2)a;
    double4 widened;
    widened.ptr[2] = 0;
    widened.ptr[3] = 0;
    widened.ptr[0] = lowHalf.array[0];
    widened.ptr[1] = lowHalf.array[1];
    return cast(__m256)widened;
}
unittest
{
    __m256 widened = _mm256_zextps128_ps256(_mm_setr_ps(2.0, -3.0, 4, -5));
    float[8] expected = [2.0, -3, 4, -5, 0, 0, 0, 0];
    assert(widened.array == expected);
}

/// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are zeroed.
__m256i _mm256_zextsi128_si256 (__m128i a) pure @trusted
{
    // View the input as two 64-bit lanes so it lines up with the 64-bit lanes of the result.
    long2 lowHalf = cast(long2)a;
    __m256i widened;
    widened.ptr[2] = 0;
    widened.ptr[3] = 0;
    widened.ptr[0] = lowHalf.array[0];
    widened.ptr[1] = lowHalf.array[1];
    return widened;
}
unittest
{
    __m256i widened = _mm256_zextsi128_si256(_mm_setr_epi64(-1, 99));
    long[4] expected = [-1, 99, 0, 0];
    assert(widened.array == expected);
}