/**
* AVX intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
*
* Copyright: Guillaume Piolat 2022.
*            Johan Engelen 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.avxintrin;

// AVX instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX
// Note: this header will work whether you have AVX enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
// generate AVX instructions.
// With GDC, use "dflags-gdc": ["-mavx"] or equivalent to actively
// generate AVX instructions.


/// IMPORTANT NOTE ABOUT MASK LOAD/STORE:
///
/// In theory, masked load/store can address unaddressable memory provided the mask is zero.
/// In practice, that is not the case, for the following reasons:
///
/// - The AMD manual says:
///   "Exception and trap behavior for elements not selected for loading or storing from/to memory
///   is implementation dependent. For instance, a given implementation may signal a data
///   breakpoint or a page fault for doublewords that are zero-masked and not actually written."
///
/// - Intel fetches the whole cacheline anyway:
///   https://erik.science/2019/06/21/AVX-fun.html
///   "Even if the mask is stored in the special mask registers, it will still first fetch the data
///   before checking the mask."
///
/// So intel-intrinsics adopted the tightened semantics of only addressing fully addressable memory
/// with masked loads and stores.
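// A minimal sketch of the tightened semantics above, mirroring the maskload unittests further
// below in this module: even lanes that are masked out must point at valid, fully addressable
// memory.
unittest
{
    static if (!maskLoadWorkaroundDMD)
    {
        // All 4 doubles are addressable, even though only lanes 0 and 3 are loaded.
        double[4] buf = [1.0, 2.0, 3.0, 4.0];
        __m256d R = _mm256_maskload_pd(buf.ptr, _mm256_setr_epi64x(-1, 0, 0, -1));
        double[4] correct = [1.0, 0.0, 0.0, 4.0];
        assert(R.array == correct);
    }
}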
/// Some AVX intrinsics take a float comparison constant.
/// When labelled "ordered" it means "AND ordered".
/// When labelled "unordered" it means "OR unordered".
alias _CMP_EQ = int;
///ditto
enum : _CMP_EQ
{
    _CMP_EQ_OQ    = 0x00, // Equal (ordered, non-signaling)
    _CMP_LT_OS    = 0x01, // Less-than (ordered, signaling)
    _CMP_LE_OS    = 0x02, // Less-than-or-equal (ordered, signaling)
    _CMP_UNORD_Q  = 0x03, // Unordered (non-signaling)
    _CMP_NEQ_UQ   = 0x04, // Not-equal (unordered, non-signaling)
    _CMP_NLT_US   = 0x05, // Not-less-than (unordered, signaling)
    _CMP_NLE_US   = 0x06, // Not-less-than-or-equal (unordered, signaling)
    _CMP_ORD_Q    = 0x07, // Ordered (non-signaling)
    _CMP_EQ_UQ    = 0x08, // Equal (unordered, non-signaling)
    _CMP_NGE_US   = 0x09, // Not-greater-than-or-equal (unordered, signaling)
    _CMP_NGT_US   = 0x0a, // Not-greater-than (unordered, signaling)
    _CMP_FALSE_OQ = 0x0b, // False (ordered, non-signaling)
    _CMP_NEQ_OQ   = 0x0c, // Not-equal (ordered, non-signaling)
    _CMP_GE_OS    = 0x0d, // Greater-than-or-equal (ordered, signaling)
    _CMP_GT_OS    = 0x0e, // Greater-than (ordered, signaling)
    _CMP_TRUE_UQ  = 0x0f, // True (unordered, non-signaling)
    _CMP_EQ_OS    = 0x10, // Equal (ordered, signaling)
    _CMP_LT_OQ    = 0x11, // Less-than (ordered, non-signaling)
    _CMP_LE_OQ    = 0x12, // Less-than-or-equal (ordered, non-signaling)
    _CMP_UNORD_S  = 0x13, // Unordered (signaling)
    _CMP_NEQ_US   = 0x14, // Not-equal (unordered, signaling)
    _CMP_NLT_UQ   = 0x15, // Not-less-than (unordered, non-signaling)
    _CMP_NLE_UQ   = 0x16, // Not-less-than-or-equal (unordered, non-signaling)
    _CMP_ORD_S    = 0x17, // Ordered (signaling)
    _CMP_EQ_US    = 0x18, // Equal (unordered, signaling)
    _CMP_NGE_UQ   = 0x19, // Not-greater-than-or-equal (unordered, non-signaling)
    _CMP_NGT_UQ   = 0x1a, // Not-greater-than (unordered, non-signaling)
    _CMP_FALSE_OS = 0x1b, // False (ordered, signaling)
    _CMP_NEQ_OS   = 0x1c, // Not-equal (ordered, signaling)
    _CMP_GE_OQ    = 0x1d, // Greater-than-or-equal (ordered, non-signaling)
    _CMP_GT_OQ    = 0x1e, // Greater-than (ordered, non-signaling)
    _CMP_TRUE_US  = 0x1f  // True (unordered, signaling)
}

public import inteli.types;
import inteli.internals;

// Pull in all previous instruction set intrinsics.
public import inteli.smmintrin;
public import inteli.tmmintrin;


// On 32-bit x86, earlier LDC versions may have trouble preserving the stack pointer when an
// unsupported 256-bit vector type is passed while AVX is disabled.
// This leads to disabling some intrinsics in this particular situation, since they are not safe
// for the caller.
version(LDC)
{
    version(X86)
    {
        enum llvm256BitStackWorkaroundIn32BitX86 = __VERSION__ < 2099;
    }
    else
        enum llvm256BitStackWorkaroundIn32BitX86 = false;
}
else
    enum llvm256BitStackWorkaroundIn32BitX86 = false;


nothrow @nogc:

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_add_pd (__m256d a, __m256d b) pure @trusted
{
    return a + b;
}
unittest
{
    align(32) double[4] A = [-1, 2, -3, 40000];
    align(32) double[4] B = [ 9, -7, 8, -0.5];
    __m256d R = _mm256_add_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
    double[4] correct = [8, -5, 5, 39999.5];
    assert(R.array == correct);
}

/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_add_ps (__m256 a, __m256 b) pure @trusted
{
    return a + b;
}
unittest
{
    align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6];
    align(32) float[8] B = [ 9.0f, -7, 8, -0.5, 8, 7, 3, -1];
    __m256 R = _mm256_add_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
    float[8] correct = [8, -5, 5, 39999.5, 8, 10, 8, 5];
    assert(R.array == correct);
}

/// Alternately add and subtract packed double-precision (64-bit) floating-point
/// elements in `a` to/from packed elements in `b`.
__m256d _mm256_addsub_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_addsubpd256(a, b);
    }
    else
    {
        // Note: GDC x86 generates addsubpd since GDC 11.1 with -O3
        //       LDC x86 generates addsubpd since LDC 1.18 with -O2
        //       LDC ARM: not fantastic, ok since LDC 1.18 -O2
        a.ptr[0] = a.array[0] + (-b.array[0]);
        a.ptr[1] = a.array[1] + b.array[1];
        a.ptr[2] = a.array[2] + (-b.array[2]);
        a.ptr[3] = a.array[3] + b.array[3];
        return a;
    }
}
unittest
{
    align(32) double[4] A = [-1, 2, -3, 40000];
    align(32) double[4] B = [ 9, -7, 8, -0.5];
    __m256d R = _mm256_addsub_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
    double[4] correct = [-10, -5, -11, 39999.5];
    assert(R.array == correct);
}

/// Alternately add and subtract packed single-precision (32-bit) floating-point elements
/// in `a` to/from packed elements in `b`.
__m256 _mm256_addsub_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_addsubps256(a, b);
    }
    else
    {
        // Note: GDC x86 generates addsubps since GDC 11 -O3
        //       and in absence of AVX, a pair of SSE3 addsubps since GDC 12 -O2
        //       LDC x86 generates addsubps since LDC 1.18 -O2
        //       and in absence of AVX, a pair of SSE3 addsubps since LDC 1.1 -O1
        //       LDC ARM: neat output since LDC 1.21 -O2

        a.ptr[0] = a.array[0] + (-b.array[0]);
        a.ptr[1] = a.array[1] + b.array[1];
        a.ptr[2] = a.array[2] + (-b.array[2]);
        a.ptr[3] = a.array[3] + b.array[3];
        a.ptr[4] = a.array[4] + (-b.array[4]);
        a.ptr[5] = a.array[5] + b.array[5];
        a.ptr[6] = a.array[6] + (-b.array[6]);
        a.ptr[7] = a.array[7] + b.array[7];
        return a;
    }
}
unittest
{
    align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6];
    align(32) float[8] B = [ 9.0f, -7, 8, -0.5, 8, 7, 3, -1];
    __m256 R = _mm256_addsub_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
    float[8] correct = [ -10, -5, -11, 39999.5, -8, 10, 2, 5];
    assert(R.array == correct);
}

/// Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_and_pd (__m256d a, __m256d b) pure @trusted
{
    // Note: GCC avxintrin.h uses the builtins for AND NOTAND OR of _ps and _pd,
    // but those do not seem needed at any optimization level.
    return cast(__m256d)(cast(__m256i)a & cast(__m256i)b);
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m256d A = _mm256_set_pd(a, b, a, b);
    __m256d B = _mm256_set_pd(b, a, b, a);
    long4 R = cast(long4)( _mm256_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
    assert(R.array[2] == correct);
    assert(R.array[3] == correct);
}

/// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_and_ps (__m256 a, __m256 b) pure @trusted
{
    return cast(__m256)(cast(__m256i)a & cast(__m256i)b);
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
    __m256 A = _mm256_set_ps(a, b, a, b, a, b, a, b);
    __m256 B = _mm256_set_ps(b, a, b, a, b, a, b, a);
    int8 R = cast(int8)( _mm256_and_ps(A, B) );
    foreach(i; 0..8)
        assert(R.array[i] == correct);
}
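// A typical use of the bitwise float ops above is sign-bit manipulation. A minimal sketch,
// assuming IEEE-754 lane layout and only intrinsics defined in this module: clearing the sign
// bit with _mm256_and_ps computes an absolute value.
unittest
{
    __m256 absMask = cast(__m256) _mm256_set1_epi32(0x7fff_ffff);
    __m256 A = _mm256_setr_ps(-1.5f, 2.0f, -0.0f, 3.25f, -4.0f, 5.0f, -6.0f, 7.0f);
    __m256 R = _mm256_and_ps(A, absMask);
    float[8] correct = [1.5f, 2.0f, 0.0f, 3.25f, 4.0f, 5.0f, 6.0f, 7.0f];
    assert(R.array == correct);
}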
/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`
/// and then AND with `b`.
__m256d _mm256_andnot_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    __m256i notA = _mm256_not_si256(cast(__m256i)a);
    __m256i ib = cast(__m256i)b;
    __m256i ab = notA & ib;
    return cast(__m256d)ab;
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long notA = ~ ( *cast(long*)(&a) );
    long correct = notA & (*cast(long*)(&b));
    __m256d A = _mm256_set_pd(a, a, a, a);
    __m256d B = _mm256_set_pd(b, b, b, b);
    long4 R = cast(long4)( _mm256_andnot_pd(A, B) );
    foreach(i; 0..4)
        assert(R.array[i] == correct);
}

/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a`
/// and then AND with `b`.
__m256 _mm256_andnot_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    __m256i notA = _mm256_not_si256(cast(__m256i)a);
    __m256i ib = cast(__m256i)b;
    __m256i ab = notA & ib;
    return cast(__m256)ab;
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int notA = ~ ( *cast(int*)(&a) );
    int correct = notA & (*cast(int*)(&b));
    __m256 A = _mm256_set1_ps(a);
    __m256 B = _mm256_set1_ps(b);
    int8 R = cast(int8)( _mm256_andnot_ps(A, B) );
    foreach(i; 0..8)
        assert(R.array[i] == correct);
}

/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control
/// mask `imm8`.
__m256d _mm256_blend_pd(int imm8)(__m256d a, __m256d b)
{
    static assert(imm8 >= 0 && imm8 < 16);

    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_blendpd256 (a, b, imm8);
    }
    else
    {
        // Works great with LDC.
        double4 r;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(0, 1, 2, 3);
    __m256d B = _mm256_setr_pd(8, 9, 10, 11);
    double4 C = _mm256_blend_pd!0x06(A, B);
    double[4] correct = [0, 9, 10, 3];
    assert(C.array == correct);
}

/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control
/// mask `imm8`.
__m256 _mm256_blend_ps(int imm8)(__m256 a, __m256 b) pure @trusted
{
    static assert(imm8 >= 0 && imm8 < 256);
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_blendps256 (a, b, imm8);
    }
    else version(LDC)
    {
        // LDC x86: generates a vblendps since LDC 1.1 -O0
        //   arm64: pretty good, four instructions worst case
        return shufflevectorLDC!(float8, (imm8 & 1)   ? 8  : 0,
                                         (imm8 & 2)   ? 9  : 1,
                                         (imm8 & 4)   ? 10 : 2,
                                         (imm8 & 8)   ? 11 : 3,
                                         (imm8 & 16)  ? 12 : 4,
                                         (imm8 & 32)  ? 13 : 5,
                                         (imm8 & 64)  ? 14 : 6,
                                         (imm8 & 128) ? 15 : 7)(a, b);
    }
    else
    {
        // LDC x86: vblendps generated since LDC 1.27 -O1
        float8 r;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0, 1,  2,  3,  4,  5,  6,  7);
    __m256 B = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15);
    float8 C = _mm256_blend_ps!0xe7(A, B);
    float[8] correct = [8, 9, 10, 3, 4, 13, 14, 15];
    assert(C.array == correct);
}
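// A worked example of the imm8 convention used by the blend intrinsics above, assuming only
// intrinsics defined in this module: bit n of imm8 set picks lane n from `b`, bit n clear keeps
// lane n of `a`. With 0b1010, lanes 1 and 3 come from `b`.
unittest
{
    __m256d A = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);
    __m256d B = _mm256_setr_pd(8.0, 9.0, 10.0, 11.0);
    double4 R = _mm256_blend_pd!0b1010(A, B);
    double[4] correct = [0.0, 9.0, 2.0, 11.0];
    assert(R.array == correct);
}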
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        // Amazingly enough, GCC/GDC generates the vblendvpd instruction
        // with -mavx2 but not -mavx.
        // Not sure what the reason is, and there is a replacement sequence.
        // Sounds like a bug, similar to _mm_blendv_pd,
        // or maybe the instruction is unsafe?
        return __builtin_ia32_blendvpd256(a, b, mask);
    }
    else static if (LDC_with_AVX)
    {
        return __builtin_ia32_blendvpd256(a, b, mask);
    }
    else
    {
        // LDC x86: vblendvpd since LDC 1.27 -O2
        //   arm64: only 4 instructions, since LDC 1.27 -O2
        __m256d r;
        long4 lmask = cast(long4)mask;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    __m256d B = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
    __m256d M = _mm256_setr_pd(-3.0, 2.0, 1.0, -4.0);
    __m256d R = _mm256_blendv_pd(A, B, M);
    double[4] correct1 = [5.0, 2.0, 3.0, 8.0];
    assert(R.array == correct1);
}

/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b`
/// using `mask`.
__m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_blendvps256(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        int8 shift;
        shift = 31;
        int8 lmask = cast(int8)mask >> shift;
        int8 ia = cast(int8)a;
        int8 ib = cast(int8)b;
        return cast(__m256)(ia ^ ((ia ^ ib) & lmask));
    }
    else
    {
        // In both LDC and GDC with SSE4.1, this generates blendvps as fallback
        __m256 r;
        int8 lmask = cast(int8)mask;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
    __m256 B = _mm256_setr_ps(5.0f, 6.0f, 7.0f, 8.0f, 5.0f, 6.0f, 7.0f, 8.0f);
    __m256 M = _mm256_setr_ps(-3.0f, 2.0f, 1.0f, -4.0f, -3.0f, 2.0f, 1.0f, -4.0f);
    __m256 R = _mm256_blendv_ps(A, B, M);
    float[8] correct1 = [5.0f, 2.0f, 3.0f, 8.0f, 5.0f, 2.0f, 3.0f, 8.0f];
    assert(R.array == correct1);
}

/// Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit)
/// floating-point elements) to all elements.
/// This effectively duplicates the 128-bit vector.
__m256d _mm256_broadcast_pd (const(__m128d)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastf128_pd256(cast(float4*)mem_addr);
    }
    else
    {
        const(double)* p = cast(const(double)*) mem_addr;
        __m256d r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[0];
        r.ptr[3] = p[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(3, -4);
    __m256d B = _mm256_broadcast_pd(&A);
    double[4] correct = [3, -4, 3, -4];
    assert(B.array == correct);
}

/// Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit)
/// floating-point elements) to all elements.
/// This effectively duplicates the 128-bit vector.
__m256 _mm256_broadcast_ps (const(__m128)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastf128_ps256(cast(float4*)mem_addr);
    }
    else
    {
        const(float)* p = cast(const(float)*)mem_addr;
        __m256 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        r.ptr[4] = p[0];
        r.ptr[5] = p[1];
        r.ptr[6] = p[2];
        r.ptr[7] = p[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, -4);
    __m256 B = _mm256_broadcast_ps(&A);
    float[8] correct = [1.0f, 2, 3, -4, 1, 2, 3, -4];
    assert(B.array == correct);
}

/// Broadcast a double-precision (64-bit) floating-point element from memory to all elements.
__m256d _mm256_broadcast_sd (const(double)* mem_addr) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastsd256(mem_addr);
    }
    else
    {
        double a = *mem_addr;
        __m256d r;
        r.ptr[0] = a;
        r.ptr[1] = a;
        r.ptr[2] = a;
        r.ptr[3] = a;
        return r;
    }
}
unittest
{
    double t = 7.5;
    __m256d A = _mm256_broadcast_sd(&t);
    double[4] correct = [7.5, 7.5, 7.5, 7.5];
    assert(A.array == correct);
}

/// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
__m128 _mm_broadcast_ss (const(float)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastss(mem_addr);
    }
    else
    {
        float a = *mem_addr;
        __m128 r;
        r.ptr[0] = a;
        r.ptr[1] = a;
        r.ptr[2] = a;
        r.ptr[3] = a;
        return r;
    }
}
unittest
{
    float t = 7.5f;
    __m128 A = _mm_broadcast_ss(&t);
    float[4] correct = [7.5f, 7.5f, 7.5f, 7.5f];
    assert(A.array == correct);
}

/// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
__m256 _mm256_broadcast_ss (const(float)* mem_addr)
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastss256 (mem_addr);
    }
    else
    {
        float a = *mem_addr;
        __m256 r = __m256(a);
        return r;
    }
}
unittest
{
    float t = 7.5f;
    __m256 A = _mm256_broadcast_ss(&t);
    float[8] correct = [7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f];
    assert(A.array == correct);
}

/// Cast vector of type `__m256d` to type `__m256`.
__m256 _mm256_castpd_ps (__m256d a) pure @safe
{
    return cast(__m256)a;
}

/// Cast vector of type `__m256d` to type `__m256i`.
__m256i _mm256_castpd_si256 (__m256d a) pure @safe
{
    return cast(__m256i)a;
}

/// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are undefined.
__m256d _mm256_castpd128_pd256 (__m128d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_pd256_pd(a);
    }
    else
    {
        __m256d r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, -6.125);
    __m256d B = _mm256_castpd128_pd256(A);
    assert(B.array[0] == 4.0);
    assert(B.array[1] == -6.125);
}

/// Cast vector of type `__m256d` to type `__m128d`; the upper 128 bits of `a` are lost.
__m128d _mm256_castpd256_pd128 (__m256d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_pd_pd256(a);
    }
    else
    {
        __m128d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_set_pd(1, 2, -6.25, 4.0);
    __m128d B = _mm256_castpd256_pd128(A);
    assert(B.array[0] == 4.0);
    assert(B.array[1] == -6.25);
}

/// Cast vector of type `__m256` to type `__m256d`.
__m256d _mm256_castps_pd (__m256 a) pure @safe
{
    return cast(__m256d)a;
}

/// Cast vector of type `__m256` to type `__m256i`.
__m256i _mm256_castps_si256 (__m256 a) pure @safe
{
    return cast(__m256i)a;
}
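// The casts above reinterpret bits and perform no numeric conversion. A quick sketch, using
// only intrinsics defined in this module, checking that a pd -> si256 -> pd round trip is
// bit-exact.
unittest
{
    __m256d A = _mm256_setr_pd(1.0, -2.5, 3.0, -4.25);
    __m256d B = _mm256_castsi256_pd(_mm256_castpd_si256(A));
    assert(A.array == B.array);
}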
/// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are undefined.
__m256 _mm256_castps128_ps256 (__m128 a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_ps256_ps(a);
    }
    else
    {
        __m256 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
    __m256 B = _mm256_castps128_ps256(A);
    float[4] correct = [1.0f, 2, 3, 4];
    assert(B.array[0..4] == correct);
}

/// Cast vector of type `__m256` to type `__m128`. The upper 128 bits of `a` are lost.
__m128 _mm256_castps256_ps128 (__m256 a) pure @trusted
{
    return *cast(const(__m128)*)(&a);
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    __m128 B = _mm256_castps256_ps128(A);
    float[4] correct = [1.0f, 2, 3, 4];
    assert(B.array == correct);
}

/// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are undefined.
__m256i _mm256_castsi128_si256 (__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    long4 r = void;
    r.ptr[0] = la.array[0];
    r.ptr[1] = la.array[1];
    return r;
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 42);
    __m256i B = _mm256_castsi128_si256(A);
    long[2] correct = [-1, 42];
    assert(B.array[0..2] == correct);
}

/// Cast vector of type `__m256i` to type `__m256d`.
__m256d _mm256_castsi256_pd (__m256i a) pure @safe
{
    return cast(__m256d)a;
}

/// Cast vector of type `__m256i` to type `__m256`.
__m256 _mm256_castsi256_ps (__m256i a) pure @safe
{
    return cast(__m256)a;
}

/// Cast vector of type `__m256i` to type `__m128i`. The upper 128 bits of `a` are lost.
__m128i _mm256_castsi256_si128 (__m256i a) pure @trusted
{
    long2 r = void;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[1];
    return cast(__m128i)r;
}
unittest
{
    long4 A;
    A.ptr[0] = -1;
    A.ptr[1] = 42;
    long2 B = cast(long2)(_mm256_castsi256_si128(A));
    long[2] correct = [-1, 42];
    assert(B.array[0..2] == correct);
}

/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer
/// value, and store the results as packed double-precision floating-point elements.
__m256d _mm256_ceil_pd (__m256d a) @safe
{
    static if (LDC_with_ARM64)
    {
        __m128d lo = _mm256_extractf128_pd!0(a);
        __m128d hi = _mm256_extractf128_pd!1(a);
        __m128d ilo = _mm_ceil_pd(lo);
        __m128d ihi = _mm_ceil_pd(hi);
        return _mm256_set_m128d(ihi, ilo);
    }
    else
    {
        return _mm256_round_pd!2(a);
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.3f, -2.12f, 53.6f, -2.7f);
    A = _mm256_ceil_pd(A);
    double[4] correct = [2.0, -2.0, 54.0, -2.0];
    assert(A.array == correct);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer
/// value, and store the results as packed single-precision floating-point elements.
__m256 _mm256_ceil_ps (__m256 a) @safe
{
    static if (LDC_with_ARM64)
    {
        __m128 lo = _mm256_extractf128_ps!0(a);
        __m128 hi = _mm256_extractf128_ps!1(a);
        __m128 ilo = _mm_ceil_ps(lo);
        __m128 ihi = _mm_ceil_ps(hi);
        return _mm256_set_m128(ihi, ilo);
    }
    else
    {
        return _mm256_round_ps!2(a);
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f);
    __m256 C = _mm256_ceil_ps(A);
    float[8] correct = [2.0f, -2.0f, 54.0f, -2.0f, -1, 3, -53, 3];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b` based on the
/// comparison operand specified by `imm8`.
__m128d _mm_cmp_pd(int imm8)(__m128d a, __m128d b) pure @safe
{
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128d) cmppd!comparison(a, b);
}
unittest
{
    __m128d A = _mm_setr_pd(double.infinity, double.nan);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    long2 R = cast(long2) _mm_cmp_pd!_CMP_GT_OS(A, B);
    long[2] correct = [-1, 0];
    assert(R.array == correct);

    long2 R2 = cast(long2) _mm_cmp_pd!_CMP_NLE_UQ(A, B);
    long[2] correct2 = [-1, -1];
    assert(R2.array == correct2);
}

///ditto
__m256d _mm256_cmp_pd(int imm8)(__m256d a, __m256d b) pure @safe
{
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m256d) cmppd256!comparison(a, b);
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, double.nan);
    __m256d B = _mm256_setr_pd(3.0, 2.0, 1.0, double.nan);
    __m256i R = cast(__m256i) _mm256_cmp_pd!_CMP_LT_OS(A, B);
    long[4] correct = [-1, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` based on the
/// comparison operand specified by `imm8`.
__m128 _mm_cmp_ps(int imm8)(__m128 a, __m128 b) pure @safe
{
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128) cmpps!comparison(a, b);
}

///ditto
__m256 _mm256_cmp_ps(int imm8)(__m256 a, __m256 b) pure @safe
{
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m256) cmpps256!comparison(a, b);
}

/// Compare the lower double-precision (64-bit) floating-point element in `a` and `b` based on the
/// comparison operand specified by `imm8`, store the result in the lower element of result, and
/// copy the upper element from `a` to the upper element of result.
__m128d _mm_cmp_sd(int imm8)(__m128d a, __m128d b) pure @safe
{
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128d) cmpsd!comparison(a, b);
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` based on the
/// comparison operand specified by `imm8`, store the result in the lower element of result, and
/// copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmp_ss(int imm8)(__m128 a, __m128 b) pure @safe
{
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128) cmpss!comparison(a, b);
}
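// A sketch of the "AND ordered" / "OR unordered" rule from the _CMP_* table near the top of
// this module: with a NaN operand, the ordered predicate fails while the unordered one succeeds.
unittest
{
    __m128d A = _mm_setr_pd(double.nan, 1.0);
    __m128d B = _mm_setr_pd(double.nan, 2.0);

    // _CMP_NEQ_OQ is "not-equal AND ordered": the NaN lane gives false.
    long2 RO = cast(long2) _mm_cmp_pd!_CMP_NEQ_OQ(A, B);
    long[2] correctO = [0, -1];
    assert(RO.array == correctO);

    // _CMP_NEQ_UQ is "not-equal OR unordered": the NaN lane gives true.
    long2 RU = cast(long2) _mm_cmp_pd!_CMP_NEQ_UQ(A, B);
    long[2] correctU = [-1, -1];
    assert(RU.array == correctU);
}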
/// Convert packed signed 32-bit integers in `a` to packed double-precision (64-bit) floating-point
/// elements.
__m256d _mm256_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x double>
            ret <4 x double> %r`;
        return LDCInlineIR!(ir, double4, __m128i)(a);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_cvtdq2pd256(a);
    }
    else
    {
        double4 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m256d R = _mm256_cvtepi32_pd(_mm_set1_epi32(54));
    double[4] correct = [54.0, 54, 54, 54];
    assert(R.array == correct);
}

/// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point
/// elements.
__m256 _mm256_cvtepi32_ps (__m256i a) pure @trusted
{
    version(LDC)
    {
        enum ir = `
            %r = sitofp <8 x i32> %0 to <8 x float>
            ret <8 x float> %r`;
        return LDCInlineIR!(ir, float8, int8)(cast(int8)a);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_cvtdq2ps256(cast(int8)a);
    }
    else
    {
        int8 ia = cast(int8)a;
        __m256 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ia.array[1];
        r.ptr[2] = ia.array[2];
        r.ptr[3] = ia.array[3];
        r.ptr[4] = ia.array[4];
        r.ptr[5] = ia.array[5];
        r.ptr[6] = ia.array[6];
        r.ptr[7] = ia.array[7];
        return r;
    }
}
unittest
{
    __m256 R = _mm256_cvtepi32_ps(_mm256_set1_epi32(5));
    float[8] correct = [5.0f, 5, 5, 5, 5, 5, 5, 5];
    assert(R.array == correct);
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit
/// integers. Follows the current rounding mode.
__m128i _mm256_cvtpd_epi32 (__m256d a) @safe
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_cvtpd2dq256(a);
    }
    else
    {
        __m128d lo = _mm256_extractf128_pd!0(a);
        __m128d hi = _mm256_extractf128_pd!1(a);
        __m128i ilo = _mm_cvtpd_epi32(lo); // Only the lower 64 bits contain significant values
        __m128i ihi = _mm_cvtpd_epi32(hi);
        return _mm_unpacklo_epi64(ilo, ihi);
    }
}
unittest
{
    int4 A = _mm256_cvtpd_epi32(_mm256_setr_pd(61.0, 55.0, -100, 1_000_000));
    int[4] correct = [61, 55, -100, 1_000_000];
    assert(A.array == correct);
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements.
__m128 _mm256_cvtpd_ps (__m256d a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_cvtpd2ps256(a);
    }
    else
    {
        __m128 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 5);
    __m128 R = _mm256_cvtpd_ps(A);
    float[4] correct = [1.0f, 2, 3, 5];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit
/// integers, using the current rounding mode.
__m256i _mm256_cvtps_epi32 (__m256 a) @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_cvtps2dq256(a);
    }
    else
    {
        __m128 lo = _mm256_extractf128_ps!0(a);
        __m128 hi = _mm256_extractf128_ps!1(a);
        __m128i ilo = _mm_cvtps_epi32(lo);
        __m128i ihi = _mm_cvtps_epi32(hi);
        return _mm256_set_m128i(ihi, ilo);
    }
}
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m256i A = _mm256_cvtps_epi32(_mm256_setr_ps(1.4f, -2.1f, 53.5f, -2.9f, -1.4f, 2.1f, -53.5f, 2.9f));
    assert( (cast(int8)A).array == [1, -2, 54, -3, -1, 2, -54, 3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm256_cvtps_epi32(_mm256_setr_ps(1.3f, -2.11f, 53.4f, -2.8f, -1.3f, 2.11f, -53.4f, 2.8f));
    assert( (cast(int8)A).array == [1, -3, 53, -3, -2, 2, -54, 2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm256_cvtps_epi32(_mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f));
    assert( (cast(int8)A).array == [2, -2, 54, -2, -1, 3, -53, 3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm256_cvtps_epi32(_mm256_setr_ps(1.4f, -2.17f, 53.8f, -2.91f, -1.4f, 2.17f, -53.8f, 2.91f));
    assert( (cast(int8)A).array == [1, -2, 53, -2, -1, 2, -53, 2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}


/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
/// double-precision (64-bit) floating-point elements.
__m256d _mm256_cvtps_pd (__m128 a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_cvtps2pd256(a); // LDC doesn't have the builtin
    }
    else
    {
        // LDC: x86, needs -O2 to generate cvtps2pd since LDC 1.2.0
        __m256d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 5);
    __m256d R = _mm256_cvtps_pd(A);
    double[4] correct = [1.0, 2, 3, 5];
    assert(R.array == correct);
}

/// Return the lower double-precision (64-bit) floating-point element of `a`.
double _mm256_cvtsd_f64 (__m256d a) pure @safe
{
    return a.array[0];
}

/// Return the lower 32-bit integer in `a`.
int _mm256_cvtsi256_si32 (__m256i a) pure @safe
{
    return (cast(int8)a).array[0];
}

/// Return the lower single-precision (32-bit) floating-point element of `a`.
float _mm256_cvtss_f32 (__m256 a) pure @safe
{
    return a.array[0];
}
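// The three scalar extractors above have no unittest; a minimal check of each, using only
// set intrinsics defined in this module.
unittest
{
    assert(_mm256_cvtsd_f64(_mm256_setr_pd(2.5, 0, 0, 0)) == 2.5);
    assert(_mm256_cvtsi256_si32(_mm256_set1_epi32(42)) == 42);
    assert(_mm256_cvtss_f32(_mm256_setr_ps(3.5f, 0, 0, 0, 0, 0, 0, 0)) == 3.5f);
}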
/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit
/// integers with truncation.
__m128i _mm256_cvttpd_epi32 (__m256d a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m128i)__builtin_ia32_cvttpd2dq256(a);
    }
    else
    {
        __m128i r;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = cast(int)a.array[2];
        r.ptr[3] = cast(int)a.array[3];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_set_pd(4.7, -1000.9, -7.1, 3.1);
    __m128i R = _mm256_cvttpd_epi32(A);
    int[4] correct = [3, -7, -1000, 4];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit
/// integers with truncation.
__m256i _mm256_cvttps_epi32 (__m256 a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256i)__builtin_ia32_cvttps2dq256(a);
    }
    else
    {
        int8 r;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = cast(int)a.array[2];
        r.ptr[3] = cast(int)a.array[3];
        r.ptr[4] = cast(int)a.array[4];
        r.ptr[5] = cast(int)a.array[5];
        r.ptr[6] = cast(int)a.array[6];
        r.ptr[7] = cast(int)a.array[7];
        return cast(__m256i)r;
    }
}
unittest
{
    __m256 A = _mm256_set_ps(4.7, -1000.9, -7.1, 3.1, 1.4, 2.9, -2.9, 0);
    int8 R = cast(int8) _mm256_cvttps_epi32(A);
    int[8] correct = [0, -2, 2, 1, 3, -7, -1000, 4];
    assert(R.array == correct);
}

/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m256d _mm256_div_pd (__m256d a, __m256d b) pure @safe
{
    return a / b;
}
unittest
{
    __m256d a = [1.5, -2.0, 3.0, 1.0];
    a = _mm256_div_pd(a, a);
    double[4] correct = [1.0, 1.0, 1.0, 1.0];
    assert(a.array == correct);
}

/// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
__m256 _mm256_div_ps (__m256 a, __m256 b) pure @safe
{
    return a / b;
}
unittest
{
    __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 4.5f, -5.0f, 6.0f, 7.0f];
    a = _mm256_div_ps(a, a);
    float[8] correct = [1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}

/// Conditionally multiply the packed single-precision (32-bit) floating-point elements in `a` and
/// `b` using the high 4 bits in `imm8`, sum the four products, and conditionally store the sum
/// using the low 4 bits of `imm8`.
__m256 _mm256_dp_ps(int imm8)(__m256 a, __m256 b)
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_dpps256(a, b, cast(ubyte)imm8);
    }
    else
    {
        // Note: in LDC with SSE4.1 but no AVX, we _could_ increase perf a bit by using two
        // _mm_dp_ps.
        __m256 zero = _mm256_setzero_ps();
        enum ubyte op = (imm8 >>> 4) & 15;
        __m256 temp = _mm256_blend_ps!( op | (op << 4) )(zero, a * b);
        float lo = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
        float hi = temp.array[4] + temp.array[5] + temp.array[6] + temp.array[7];
        __m256 r = _mm256_set_m128(_mm_set1_ps(hi), _mm_set1_ps(lo));
        enum ubyte op2 = (imm8 & 15);
        return _mm256_blend_ps!(op2 | (op2 << 4))(zero, r);
    }
}
unittest
{
    // Products:               9 14 20 24 6 16 12 -24
    __m256 A = _mm256_setr_ps(1.0f, 2.0f, 4.0f, 8.0f, 1.0f, 2.0f, 4.0f, 8.0f);
    __m256 B = _mm256_setr_ps(9.0f, 7.0f, 5.0f, 3.0f, 6.0f, 8.0f, 3.0f,-3.0f);
    float8 R1 = _mm256_dp_ps!(0xf0 + 0xf)(A, B);
    float8 R2 = _mm256_dp_ps!(0x30 + 0x5)(A, B);
    float8 R3 = _mm256_dp_ps!(0x50 + 0xa)(A, B);
    float[8] correct1 = [67.0f, 67.0f, 67.0f,67.0f, 10, 10, 10, 10];
    float[8] correct2 = [23.0f, 0.0f, 23.0f, 0.0f, 22, 0, 22, 0];
    float[8] correct3 = [0.0f, 29.0f, 0.0f, 29.0f, 0, 18, 0, 18];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}
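// A worked dot-product example of the nibble convention described above, assuming only
// intrinsics defined in this module: imm8 = 0xF1 multiplies all four lanes of each 128-bit half
// (high nibble 0xF) and stores the sum only in element 0 of that half (low nibble 0x1).
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
    __m256 B = _mm256_setr_ps(5.0f, 6.0f, 7.0f, 8.0f, 5.0f, 6.0f, 7.0f, 8.0f);
    float8 R = _mm256_dp_ps!0xF1(A, B); // 5 + 12 + 21 + 32 = 70 in each half
    float[8] correct = [70.0f, 0, 0, 0, 70.0f, 0, 0, 0];
    assert(R.array == correct);
}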
/// Extract a 32-bit integer from `a`, selected with `imm8`.
int _mm256_extract_epi32 (__m256i a, const int imm8) pure @trusted
{
    return (cast(int8)a).array[imm8 & 7];
}
unittest
{
    align(16) int[8] data = [-1, 2, -3, 4, 9, -7, 8, -6];
    auto A = _mm256_loadu_si256(cast(__m256i*) data.ptr);
    assert(_mm256_extract_epi32(A, 0) == -1);
    assert(_mm256_extract_epi32(A, 1 + 8) == 2);
    assert(_mm256_extract_epi32(A, 3 + 16) == 4);
    assert(_mm256_extract_epi32(A, 7 + 32) == -6);
}

/// Extract a 64-bit integer from `a`, selected with `index`.
long _mm256_extract_epi64 (__m256i a, const int index) pure @safe
{
    return a.array[index & 3];
}
unittest
{
    __m256i A = _mm256_setr_epi64x(-7, 6, 42, 0);
    assert(_mm256_extract_epi64(A, -8) == -7);
    assert(_mm256_extract_epi64(A, 1) == 6);
    assert(_mm256_extract_epi64(A, 2 + 4) == 42);
}

/// Extract a 128-bit lane from `a`, selected with `imm8` (0 or 1).
/// Note: `_mm256_extractf128_pd!0` is equivalent to `_mm256_castpd256_pd128`.
__m128d _mm256_extractf128_pd(ubyte imm8)(__m256d a) pure @trusted
{
    version(GNU) pragma(inline, true); // else GDC has trouble inlining this

    // PERF DMD
    static if (GDC_with_AVX)
    {
        // Note: needs to be a template intrinsic because of this builtin.
        return __builtin_ia32_vextractf128_pd256(a, imm8 & 1);
    }
    else
    {
        double2 r = void;
        enum int index = 2*(imm8 & 1);
        r.ptr[0] = a.array[index+0];
        r.ptr[1] = a.array[index+1];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    double[4] correct = [1.0, 2, 3, 4];
    __m128d l0 = _mm256_extractf128_pd!18(A);
    __m128d l1 = _mm256_extractf128_pd!55(A);
    assert(l0.array == correct[0..2]);
    assert(l1.array == correct[2..4]);
}

///ditto
__m128 _mm256_extractf128_ps(ubyte imm8)(__m256 a) pure @trusted
{
    version(GNU) pragma(inline, true); // else GDC has trouble inlining this

    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vextractf128_ps256(a, imm8 & 1);
    }
    else
    {
        float4 r = void; // Optimizes well since LDC 1.1 -O1
        enum int index = 4*(imm8 & 1);
        r.ptr[0] = a.array[index+0];
        r.ptr[1] = a.array[index+1];
        r.ptr[2] = a.array[index+2];
        r.ptr[3] = a.array[index+3];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0, 2, 3, 4, 5, 6, 7, 8);
    float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8];
    __m128 l0 = _mm256_extractf128_ps!8(A);
    __m128 l1 = _mm256_extractf128_ps!255(A);
    assert(l0.array == correct[0..4]);
    assert(l1.array == correct[4..8]);
}

///ditto
__m128i _mm256_extractf128_si256(ubyte imm8)(__m256i a) pure @trusted
{
    version(GNU) pragma(inline, true); // else GDC has trouble inlining this

    // PERF DMD
    static if (GDC_with_AVX)
    {
        // Note: if it weren't for this GDC intrinsic, _mm256_extractf128_si256
        // could be a non-template; however, this wins at -O0.
        // Same story for _mm256_extractf128_ps and _mm256_extractf128_pd.
        return __builtin_ia32_vextractf128_si256(cast(int8)a, imm8 & 1);
    }
    else
    {
        long2 r = void;
        enum int index = 2*(imm8 & 1);
        r.ptr[0] = a.array[index+0];
        r.ptr[1] = a.array[index+1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(9, 2, 3, 4, 5, 6, 7, 8);
    int[8] correct = [9, 2, 3, 4, 5, 6, 7, 8];
    __m128i l0 = _mm256_extractf128_si256!0(A);
    __m128i l1 = _mm256_extractf128_si256!1(A);
    assert(l0.array == correct[0..4]);
    assert(l1.array == correct[4..8]);
}
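// A small sketch of splitting a 256-bit vector into its two 128-bit lanes and rejoining it,
// using _mm256_set_m128d (defined later in this module).
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    __m128d lo = _mm256_extractf128_pd!0(A); // same as _mm256_castpd256_pd128(A)
    __m128d hi = _mm256_extractf128_pd!1(A);
    __m256d R = _mm256_set_m128d(hi, lo);
    assert(R.array == A.array);
}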
/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an integer
/// value, and store the results as packed double-precision floating-point elements.
__m256d _mm256_floor_pd (__m256d a) @safe
{
    static if (LDC_with_ARM64)
    {
        __m128d lo = _mm256_extractf128_pd!0(a);
        __m128d hi = _mm256_extractf128_pd!1(a);
        __m128d ilo = _mm_floor_pd(lo);
        __m128d ihi = _mm_floor_pd(hi);
        return _mm256_set_m128d(ihi, ilo);
    }
    else
    {
        return _mm256_round_pd!1(a);
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.3f, -2.12f, 53.6f, -2.7f);
    A = _mm256_floor_pd(A);
    double[4] correct = [1.0, -3.0, 53.0, -3.0];
    assert(A.array == correct);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an integer
/// value, and store the results as packed single-precision floating-point elements.
__m256 _mm256_floor_ps (__m256 a) @safe
{
    static if (LDC_with_ARM64)
    {
        __m128 lo = _mm256_extractf128_ps!0(a);
        __m128 hi = _mm256_extractf128_ps!1(a);
        __m128 ilo = _mm_floor_ps(lo);
        __m128 ihi = _mm_floor_ps(hi);
        return _mm256_set_m128(ihi, ilo);
    }
    else
    {
        return _mm256_round_ps!1(a);
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f);
    __m256 C = _mm256_floor_ps(A);
    float[8] correct = [1.0f, -3.0f, 53.0f, -3.0f, -2, 2, -54, 2];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in `a`
/// and `b`.
__m256d _mm256_hadd_pd (__m256d a, __m256d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_haddpd256(a, b);
    }
    else
    {
        __m256d res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = b.array[1] + b.array[0];
        res.ptr[2] = a.array[3] + a.array[2];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
    __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
    __m256d C = _mm256_hadd_pd(A, B);
    double[4] correct = [3.5, 8.0, 30.0, 114.0];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in `a` and
/// `b`.
__m256 _mm256_hadd_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_haddps256(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 b_hi = _mm256_extractf128_ps!1(b);
        __m128 b_lo = _mm256_extractf128_ps!0(b);
        __m128 hi = vpaddq_f32(a_hi, b_hi);
        __m128 lo = vpaddq_f32(a_lo, b_lo);
        return _mm256_set_m128(hi, lo);
    }
    else
    {
        __m256 res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        res.ptr[4] = a.array[5] + a.array[4];
        res.ptr[5] = a.array[7] + a.array[6];
        res.ptr[6] = b.array[5] + b.array[4];
        res.ptr[7] = b.array[7] + b.array[6];
        return res;
    }
}
unittest
{
    __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
    __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
    __m256 R = _mm256_hadd_ps(A, B);
    float[8] correct = [3.0f, 8.0f, 3.5f, 7.5f, 3.0f, 8.0f, 3.5f, 8.5f];
    assert(R.array == correct);
}

/// Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in
/// `a` and `b`.
__m256d _mm256_hsub_pd (__m256d a, __m256d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_hsubpd256(a, b);
    }
    else
    {
        // 2 zip1, 2 zip2, 2 fsub... I don't think there is anything better on arm64
        __m256d res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = b.array[0] - b.array[1];
        res.ptr[2] = a.array[2] - a.array[3];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
    __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
    __m256d C = _mm256_hsub_pd(A, B);
    double[4] correct = [-0.5, -6.0, 12.0, 86.0];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in
/// `a` and `b`.
__m256 _mm256_hsub_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_hsubps256(a, b);
    }
    else
    {
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 b_hi = _mm256_extractf128_ps!1(b);
        __m128 b_lo = _mm256_extractf128_ps!0(b);
        __m128 hi = _mm_hsub_ps(a_hi, b_hi);
        __m128 lo = _mm_hsub_ps(a_lo, b_lo);
        return _mm256_set_m128(hi, lo);
    }
}
unittest
{
    __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
    __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
    __m256 R = _mm256_hsub_ps(A, B);
    float[8] correct = [-1.0f, -2.0f, -0.5f, -0.5f, -1.0f, -2.0f, -0.5f, -1.5f];
    assert(R.array == correct);
}
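// A common use of the horizontal ops above: a sketch reducing 8 floats to their total with two
// _mm256_hadd_ps, then adding the two 128-bit halves. _mm_cvtss_f32 comes from the SSE
// intrinsics publicly imported at the top of this module.
unittest
{
    __m256 v = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
    __m256 h = _mm256_hadd_ps(v, v);  // [3, 7, 3, 7, 11, 15, 11, 15]
    h = _mm256_hadd_ps(h, h);         // [10, 10, 10, 10, 26, 26, 26, 26]
    float sum = _mm_cvtss_f32(_mm256_castps256_ps128(h))
              + _mm_cvtss_f32(_mm256_extractf128_ps!1(h));
    assert(sum == 36.0f);
}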
__m256i _mm256_insert_epi16 (__m256i a, short i, const int index) pure @trusted
{
    short16 sa = cast(short16)a;
    sa.ptr[index & 15] = i;
    return cast(__m256i)sa;
}
unittest
{
    __m256i A = _mm256_set1_epi16(1);
    short16 R = cast(short16) _mm256_insert_epi16(A, 2, 16 + 16 + 7);
    short[16] correct = [1, 1, 1, 1, 1, 1, 1, 2,
                         1, 1, 1, 1, 1, 1, 1, 1 ];
    assert(R.array == correct);
}

/// Copy `a`, and insert the 32-bit integer `i` into the result at the location specified by
/// `index & 7`.
__m256i _mm256_insert_epi32 (__m256i a, int i, const int index) pure @trusted
{
    int8 ia = cast(int8)a;
    ia.ptr[index & 7] = i;
    return cast(__m256i)ia;
}
unittest
{
    __m256i A = _mm256_set1_epi32(1);
    int8 R = cast(int8) _mm256_insert_epi32(A, -2, 8 + 8 + 1);
    int[8] correct = [1, -2, 1, 1, 1, 1, 1, 1];
    assert(R.array == correct);
}

/// Copy `a`, and insert the 64-bit integer `i` into the result at the location specified by
/// `index & 3`.
__m256i _mm256_insert_epi64(__m256i a, long i, const int index) pure @trusted
{
    a.ptr[index & 3] = i;
    return a;
}
unittest
{
    __m256i A = _mm256_set1_epi64(1);
    long4 R = cast(long4) _mm256_insert_epi64(A, -2, 2 - 4 - 4);
    long[4] correct = [1, 1, -2, 1];
    assert(R.array == correct);
}

/// Copy `a`, and insert the 8-bit integer `i` into the result at the location specified by
/// `index & 31`.
__m256i _mm256_insert_epi8(__m256i a, byte i, const int index) pure @trusted
{
    byte32 ba = cast(byte32)a;
    ba.ptr[index & 31] = i;
    return cast(__m256i)ba;
}
unittest
{
    __m256i A = _mm256_set1_epi8(1);
    byte32 R = cast(byte32) _mm256_insert_epi8(A, -2, 7 - 32 - 32);
    byte[32] correct = [1, 1, 1, 1, 1, 1, 1,-2, 1, 1, 1, 1, 1, 1, 1, 1,
                        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ];
    assert(R.array == correct);
}

/// Copy `a`, then insert 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `b` at the location specified by `imm8`.
__m256d _mm256_insertf128_pd(int imm8)(__m256d a, __m128d b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return __builtin_ia32_vinsertf128_pd256(a, b, lane);
    }
    else
    {
        __m256d r = a;
        enum int index = (imm8 & 1) ? 2 : 0;
        r.ptr[index] = b.array[0];
        r.ptr[index+1] = b.array[1];
        return r;
    }
}

/// Copy `a`, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point
/// elements) from `b` at the location specified by `imm8`.
__m256 _mm256_insertf128_ps(int imm8)(__m256 a, __m128 b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return __builtin_ia32_vinsertf128_ps256(a, b, lane);
    }
    else
    {
        __m256 r = a;
        enum int index = (imm8 & 1) ? 4 : 0;
        r.ptr[index] = b.array[0];
        r.ptr[index+1] = b.array[1];
        r.ptr[index+2] = b.array[2];
        r.ptr[index+3] = b.array[3];
        return r;
    }
}
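// _mm256_insertf128_pd and _mm256_insertf128_ps above have no unittest; a minimal check of the
// lane selection, using only intrinsics defined in this library.
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    __m128d B = _mm_setr_pd(9.0, 10.0);
    __m256d lo = _mm256_insertf128_pd!0(A, B);
    __m256d hi = _mm256_insertf128_pd!1(A, B);
    double[4] correctLo = [9.0, 10.0, 3.0, 4.0];
    double[4] correctHi = [1.0, 2.0, 9.0, 10.0];
    assert(lo.array == correctLo);
    assert(hi.array == correctHi);
}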
/// Copy `a`, then insert 128 bits from `b` at the location specified by `imm8`.
__m256i _mm256_insertf128_si256(int imm8)(__m256i a, __m128i b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return cast(__m256i) __builtin_ia32_vinsertf128_si256 (cast(int8)a, b, lane);
    }
    else
    {
        long2 lb = cast(long2)b;
        __m256i r = a;
        enum int index = (imm8 & 1) ? 2 : 0;
        r.ptr[index] = lb.array[0];
        r.ptr[index+1] = lb.array[1];
        return r;
    }
}

/// Load 256-bits of integer data from unaligned memory into dst.
/// This intrinsic may run better than `_mm256_loadu_si256` when the data crosses a cache
/// line boundary.
__m256i _mm256_lddqu_si256(const(__m256i)* mem_addr) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_lddqu256(cast(const(char)*)mem_addr);
    }
    else
        return _mm256_loadu_si256(mem_addr);
}
unittest
{
    int[10] correct = [0, -1, 2, -3, 4, 9, -7, 8, -6, 34];
    int8 A = cast(int8) _mm256_lddqu_si256(cast(__m256i*) &correct[1]);
    assert(A.array == correct[1..9]);
}

/// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements)
/// from memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
/// exception may be generated.
__m256d _mm256_load_pd (const(double)* mem_addr) pure @trusted
{
    return *cast(__m256d*)mem_addr;
}
unittest
{
    static immutable align(32) double[4] correct = [1.0, 2.0, 3.5, -42.0];
    __m256d A = _mm256_load_pd(correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
__m256 _mm256_load_ps (const(float)* mem_addr) pure @trusted
{
    return *cast(__m256*)mem_addr;
}
unittest
{
    static immutable align(32) float[8] correct =
        [1.0, 2.0, 3.5, -42.0, 7.43f, 0.0f, 3, 2];
    __m256 A = _mm256_load_ps(correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits of integer data from memory. `mem_addr` does not need to be aligned on
/// any particular boundary.
// See this dlang forum post => https://forum.dlang.org/thread/vymrsngsfibkmqsqffce@forum.dlang.org
__m256i _mm256_loadu_si256 (const(__m256i)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m256i)(cast(long*)mem_addr);
    }
    else
    {
        const(long)* p = cast(const(long)*)mem_addr;
        long4 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        return r;
    }
}
unittest
{
    align(16) int[8] correct = [-1, 2, -3, 4, 9, -7, 8, -6];
    int8 A = cast(int8) _mm256_loadu_si256(cast(__m256i*) correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits of integer data from memory. `mem_addr` must be aligned on a
/// 32-byte boundary or a general-protection exception may be generated.
__m256i _mm256_load_si256 (const(void)* mem_addr) pure @system
{
    return *cast(__m256i*)mem_addr;
}
unittest
{
    static immutable align(64) long[4] correct = [1, -2, long.min, long.max];
    __m256i A = _mm256_load_si256(correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements)
/// from memory. `mem_addr` does not need to be aligned on any particular boundary.
__m256d _mm256_loadu_pd (const(void)* mem_addr) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_loadupd256 ( cast(const(double)*) mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m256d)(cast(double*)mem_addr);
    }
    else
    {
        const(double)* p = cast(const(double)*)mem_addr;
        double4 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        return r;
    }
}
unittest
{
    double[4] correct = [1.0, -2.0, 0.0, 768.5];
    __m256d A = _mm256_loadu_pd(correct.ptr);
    assert(A.array == correct);
}

/// Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m256 _mm256_loadu_ps (const(float)* mem_addr) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_loadups256 ( cast(const(float)*) mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m256)(cast(float*)mem_addr);
    }
    else
    {
        const(float)* p = cast(const(float)*)mem_addr;
        float8 r = void;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        r.ptr[4] = p[4];
        r.ptr[5] = p[5];
        r.ptr[6] = p[6];
        r.ptr[7] = p[7];
        return r;
    }
}
unittest
{
    align(32) float[10] correct = [0.0f, 1, 2, 3, 4, 5, 6, 7, 8, 9];
    __m256 A = _mm256_loadu_ps(&correct[1]);
    assert(A.array == correct[1..9]);
}

/// Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point
/// elements) from memory, and combine them into a 256-bit value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256 _mm256_loadu2_m128 (const(float)* hiaddr, const(float)* loaddr) pure @system
{
    // Note: no particular instruction for this in x86.
    return _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr));
}
unittest
{
    align(32) float[6] A = [4.5f, 2, 8, 97, -1, 3];
    align(32) float[6] B = [6.5f, 3, 9, 98, -2, 4];
    __m256 R = _mm256_loadu2_m128(&B[1], &A[1]);
    float[8] correct = [2.0f, 8, 97, -1, 3, 9, 98, -2];
    assert(R.array == correct);
}

/// Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point
/// elements) from memory, and combine them into a 256-bit value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256d _mm256_loadu2_m128d (const(double)* hiaddr, const(double)* loaddr) pure @system
{
    // Note: no particular instruction for this in x86.
    return _mm256_set_m128d(_mm_loadu_pd(hiaddr), _mm_loadu_pd(loaddr));
}
unittest
{
    align(32) double[4] A = [4.5f, 2, 8, 97];
    align(32) double[4] B = [6.5f, 3, 9, 98];
    __m256d R = _mm256_loadu2_m128d(&B[1], &A[1]);
    double[4] correct = [2.0, 8, 3, 9];
    assert(R.array == correct);
}

/// Load two 128-bit values (composed of integer data) from memory, and combine them into a
/// 256-bit value. `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256i _mm256_loadu2_m128i (const(__m128i)* hiaddr, const(__m128i)* loaddr) pure @trusted
{
    // Note: no particular instruction for this in x86.
    return _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr));
}
unittest
{
    align(32) long[4] A = [5, 2, 8, 97];
    align(32) long[4] B = [6, 3, 9, 98];
    __m256i R = _mm256_loadu2_m128i(cast(const(__m128i)*) &B[1], cast(const(__m128i)*) &A[1]);
    long[4] correct = [2, 8, 3, 9];
    assert(R.array == correct);
}

version(DigitalMars)
{
    // This avoids a bug with DMD < 2.099 on x86 with -O.
    private enum bool maskLoadWorkaroundDMD = (__VERSION__ < 2099);
}
else
{
    private enum bool maskLoadWorkaroundDMD = false;
}

/// Load packed double-precision (64-bit) floating-point elements from memory using `mask`
/// (elements are zeroed out when the high bit of the corresponding element is not set).
/// Note: emulating that instruction isn't efficient, since the real instruction performs
/// memory accesses only where the mask is set.
/// See: "IMPORTANT NOTE ABOUT MASK LOAD/STORE" to know why you must address valid memory only.
__m128d _mm_maskload_pd (const(double)* mem_addr, __m128i mask) /* pure */ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO: report that the builtin is impure
        return __builtin_ia32_maskloadpd(mem_addr, cast(long2)mask);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_maskloadpd(cast(double2*)mem_addr, cast(long2)mask);
    }
    else
    {
        __m128d a = _mm_loadu_pd(mem_addr);
        __m128d zero = _mm_setzero_pd();
        return _mm_blendv_pd(zero, a, cast(double2)mask);
    }
}
unittest
{
    static if (!maskLoadWorkaroundDMD)
    {
        double[2] A = [7.5, 1];
        double2 B = _mm_maskload_pd(A.ptr, _mm_setr_epi64(-1, 1));
        double[2] correct = [7.5, 0];
        assert(B.array == correct);
    }
}

/// Load packed double-precision (64-bit) floating-point elements from memory using `mask`
/// (elements are zeroed out when the high bit of the corresponding element is not set).
/// See: "IMPORTANT NOTE ABOUT MASK LOAD/STORE" to know why you must address valid memory only.

/// Load packed double-precision (64-bit) floating-point elements from memory using `mask`
/// (elements are zeroed out when the high bit of the corresponding element is not set).
/// See: "Note about mask load/store" to know why you must address valid memory only.
__m256d _mm256_maskload_pd (const(double)* mem_addr, __m256i mask) /*pure*/ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadpd256(mem_addr, mask);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_maskloadpd256(cast(double4*)mem_addr, mask);
    }
    else
    {
        __m256d a = _mm256_loadu_pd(mem_addr);
        __m256d zero = _mm256_setzero_pd();
        return _mm256_blendv_pd(zero, a, cast(double4)mask);
    }
}
unittest
{
    static if (!maskLoadWorkaroundDMD)
    {
        double[4] A = [7.5, 1, 2, 3];
        double4 B = _mm256_maskload_pd(A.ptr, _mm256_setr_epi64(1, -1, -1, 1));
        double[4] correct = [0.0, 1, 2, 0];
        assert(B.array == correct);
    }
}

/// Load packed single-precision (32-bit) floating-point elements from memory using `mask`
/// (elements are zeroed out when the high bit of the corresponding element is not set).
/// Note: emulating this instruction isn't efficient, since the hardware instruction touches
/// memory only for selected elements.
/// See: "Note about mask load/store" to know why you must address valid memory only.
__m128 _mm_maskload_ps (const(float)* mem_addr, __m128i mask) /* pure */ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadps(mem_addr, mask);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_maskloadps(cast(float4*)mem_addr, mask);
    }
    else
    {
        __m128 a = _mm_loadu_ps(mem_addr);
        __m128 zero = _mm_setzero_ps();
        return _mm_blendv_ps(zero, a, cast(float4)mask);
    }
}
unittest
{
    static if (!maskLoadWorkaroundDMD)
    {
        float[4] A = [7.5f, 1, 2, 3];
        float4 B = _mm_maskload_ps(A.ptr, _mm_setr_epi32(1, -1, -1, 1));
        float[4] correct = [0.0f, 1, 2, 0];
        assert(B.array == correct);
    }
}

/// Load packed single-precision (32-bit) floating-point elements from memory using `mask`
/// (elements are zeroed out when the high bit of the corresponding element is not set).
/// Note: emulating this instruction isn't efficient, since the hardware instruction touches
/// memory only for selected elements.
/// See: "Note about mask load/store" to know why you must address valid memory only.
__m256 _mm256_maskload_ps (const(float)* mem_addr, __m256i mask) /*pure*/ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadps256(mem_addr, cast(int8)mask);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_maskloadps256(cast(float8*)mem_addr, cast(int8)mask);
    }
    else
    {
        __m256 a = _mm256_loadu_ps(mem_addr);
        __m256 zero = _mm256_setzero_ps();
        return _mm256_blendv_ps(zero, a, cast(float8)mask);
    }
}
unittest
{
    float[8] A = [1, 7.5f, 1, 2, 3, 4, 5, 6];
    __m256i M = _mm256_setr_epi32(1, -1, 1, -1, 1, -1, -1, 1);
    float8 B = _mm256_maskload_ps(A.ptr, M);
    float[8] correct = [0.0f, 7.5f, 0, 2, 0, 4, 5, 0];
    assert(B.array == correct);
}
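// Example (not in the Intel API): a common pattern is to build the mask from a lane count,
// to load the tail of an array in one masked load. A minimal sketch; the buffer is
// deliberately made fully addressable (8 floats), per the module note on mask load/store:
unittest
{
    align(32) float[8] buf = [1.0f, 2, 3, 4, 5, 6, 7, 8];
    int count = 5; // only the first 5 elements are "valid"
    // Lane i is selected when i < count (high bit of the 32-bit mask element set).
    __m256i mask = _mm256_setr_epi32(count > 0 ? -1 : 0, count > 1 ? -1 : 0,
                                     count > 2 ? -1 : 0, count > 3 ? -1 : 0,
                                     count > 4 ? -1 : 0, count > 5 ? -1 : 0,
                                     count > 6 ? -1 : 0, count > 7 ? -1 : 0);
    __m256 tail = _mm256_maskload_ps(buf.ptr, mask);
    float[8] correct = [1.0f, 2, 3, 4, 5, 0, 0, 0];
    assert(tail.array == correct);
}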

/// Store packed double-precision (64-bit) floating-point elements from `a` into memory using `mask`.
/// Note: emulating this instruction isn't efficient, since the hardware instruction touches
/// memory only for selected elements.
/// See: "Note about mask load/store" to know why you must address valid memory only.
void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a) /* pure */ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        __builtin_ia32_maskstorepd(mem_addr, cast(long2)mask, a);
    }
    else static if (GDC_with_AVX)
    {
        __builtin_ia32_maskstorepd(cast(double2*)mem_addr, cast(long2)mask, a);
    }
    else
    {
        __m128d source = _mm_loadu_pd(mem_addr);
        __m128d r = _mm_blendv_pd(source, a, cast(double2) mask);
        _mm_storeu_pd(mem_addr, r);
    }
}
unittest
{
    double[2] A = [0.0, 1.0];
    __m128i M = _mm_setr_epi64(-1, 0);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    _mm_maskstore_pd(A.ptr, M, B);
    double[2] correct = [2.0, 1.0];
    assert(A == correct);
}

/// Store packed double-precision (64-bit) floating-point elements from `a` into memory using `mask`.
/// See: "Note about mask load/store" to know why you must address valid memory only.
static if (!llvm256BitStackWorkaroundIn32BitX86)
{
    void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a) /* pure */ @system
    {
        // PERF DMD
        static if (LDC_with_AVX)
        {
            // MAYDO report that the builtin is impure
            __builtin_ia32_maskstorepd256(mem_addr, cast(long4)mask, a);
        }
        else static if (GDC_with_AVX)
        {
            __builtin_ia32_maskstorepd256(cast(double4*)mem_addr, cast(long4)mask, a);
        }
        else
        {
            __m256d source = _mm256_loadu_pd(mem_addr);
            __m256d r = _mm256_blendv_pd(source, a, cast(double4) mask);
            _mm256_storeu_pd(mem_addr, r);
        }
    }
    unittest
    {
        double[4] A = [0.0, 1, 2, 3];
        __m256i M = _mm256_setr_epi64x(-9, 0, -1, 0);
        __m256d B = _mm256_setr_pd(2, 3, 4, 5);
        _mm256_maskstore_pd(A.ptr, M, B);
        double[4] correct = [2.0, 1, 4, 3];
        assert(A == correct);
    }
}

/// Store packed single-precision (32-bit) floating-point elements from `a` into memory using `mask`.
/// Note: emulating this instruction isn't efficient, since the hardware instruction touches
/// memory only for selected elements.
/// See: "Note about mask load/store" to know why you must address valid memory only.
void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a) /* pure */ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        __builtin_ia32_maskstoreps(mem_addr, mask, a);
    }
    else static if (GDC_with_AVX)
    {
        __builtin_ia32_maskstoreps(cast(float4*)mem_addr, mask, a);
    }
    else
    {
        __m128 source = _mm_loadu_ps(mem_addr);
        __m128 r = _mm_blendv_ps(source, a, cast(float4) mask);
        _mm_storeu_ps(mem_addr, r);
    }
}
unittest
{
    float[4] A = [0.0f, 1, 2, 6];
    __m128i M = _mm_setr_epi32(-1, 0, -1, 0);
    __m128 B = _mm_setr_ps(2, 3, 4, 5);
    _mm_maskstore_ps(A.ptr, M, B);
    float[4] correct = [2.0f, 1, 4, 6];
    assert(A == correct);
}
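// Example (not in the Intel API): masked stores express "write only where a predicate
// holds" without a branch. A minimal sketch that clamps negative lanes to zero in place,
// reusing the sign bit of the float data itself as the mask:
unittest
{
    align(16) float[4] data = [1.0f, -2.0f, 3.0f, -4.0f];
    __m128 v = _mm_loadu_ps(data.ptr);
    // Negative lanes have their sign bit set, which is exactly what maskstore tests.
    __m128i negMask = cast(__m128i) v;
    _mm_maskstore_ps(data.ptr, negMask, _mm_setzero_ps());
    float[4] correct = [1.0f, 0.0f, 3.0f, 0.0f];
    assert(data == correct);
}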
static if (!llvm256BitStackWorkaroundIn32BitX86)
{
    /// Store packed single-precision (32-bit) floating-point elements from `a` into memory using `mask`.
    /// See: "Note about mask load/store" to know why you must address valid memory only.
    void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) /* pure */ @system
    {
        // PERF DMD
        static if (LDC_with_AVX)
        {
            // MAYDO report that the builtin is impure
            __builtin_ia32_maskstoreps256(mem_addr, cast(int8)mask, a);
        }
        else static if (GDC_with_AVX)
        {
            __builtin_ia32_maskstoreps256(cast(float8*)mem_addr, cast(int8)mask, a);
        }
        else
        {
            __m256 source = _mm256_loadu_ps(mem_addr);
            __m256 r = _mm256_blendv_ps(source, a, cast(float8) mask);
            _mm256_storeu_ps(mem_addr, r);
        }
    }
    unittest
    {
        float[8] A = [0.0f, 0, 1, 2, 3, 4, 5, 7];
        __m256i M = _mm256_setr_epi32( 0, -1, 0, -1, 0, -1, -1, 0);
        __m256 B = _mm256_set1_ps(6.0f);
        _mm256_maskstore_ps(A.ptr, M, B);
        float[8] correct = [0.0f, 6, 1, 6, 3, 6, 6, 7];
        assert(A == correct);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return
/// packed maximum values.
__m256d _mm256_max_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_maxpd256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2
        // PERF: GDC without AVX
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
    __m256d B = _mm256_setr_pd(1.0, 8.0, 0.0, 100000.0);
    __m256d M = _mm256_max_pd(A, B);
    double[4] correct = [4.0, 8.0, 0.0, double.infinity];
    assert(M.array == correct);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return
/// packed maximum values.
__m256 _mm256_max_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_maxps256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2, but looks brittle.
        // PERF GDC without AVX
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
        a.ptr[4] = (a.array[4] > b.array[4]) ? a.array[4] : b.array[4];
        a.ptr[5] = (a.array[5] > b.array[5]) ? a.array[5] : b.array[5];
        a.ptr[6] = (a.array[6] > b.array[6]) ? a.array[6] : b.array[6];
        a.ptr[7] = (a.array[7] > b.array[7]) ? a.array[7] : b.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
    __m256 B = _mm256_setr_ps(1.0, 8.0, 0.0, 100000.0f , 4, 3, 2, 1);
    __m256 M = _mm256_max_ps(A, B);
    float[8] correct = [4.0, 8.0, 0.0, float.infinity , 4, 3, 3, 4];
    assert(M.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return
/// packed minimum values.
__m256d _mm256_min_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_minpd256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2
        // PERF: GDC without AVX
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
    __m256d B = _mm256_setr_pd(1.0, 8.0, 0.0, 100000.0);
    __m256d M = _mm256_min_pd(A, B);
    double[4] correct = [1.0, 1.0, -9.0, 100000.0];
    assert(M.array == correct);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return
/// packed minimum values.
__m256 _mm256_min_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_minps256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2, but looks brittle.
        // PERF GDC without AVX
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
        a.ptr[4] = (a.array[4] < b.array[4]) ? a.array[4] : b.array[4];
        a.ptr[5] = (a.array[5] < b.array[5]) ? a.array[5] : b.array[5];
        a.ptr[6] = (a.array[6] < b.array[6]) ? a.array[6] : b.array[6];
        a.ptr[7] = (a.array[7] < b.array[7]) ? a.array[7] : b.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
    __m256 B = _mm256_setr_ps(1.0, 8.0, 0.0, 100000.0f , 4, 3, 2, 1);
    __m256 M = _mm256_min_ps(A, B);
    float[8] correct = [1.0, 1.0, -9.0, 100000.0f , 1, 2, 2, 1];
    assert(M.array == correct);
}
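// Example (not in the Intel API): like the hardware MINPD/MAXPD instructions, these
// intrinsics are not commutative in the presence of NaN: when either operand is NaN,
// the second operand is returned. A minimal sketch of that property, which holds for
// both the AVX builtin path and the scalar fallback above:
unittest
{
    __m256d A = _mm256_setr_pd(double.nan, 1.0, double.nan, 1.0);
    __m256d B = _mm256_setr_pd(2.0, double.nan, 2.0, double.nan);
    __m256d lo = _mm256_min_pd(A, B);
    __m256d hi = _mm256_max_pd(A, B);
    // Lane 0: min(nan, 2.0) == 2.0; lane 1: min(1.0, nan) is NaN.
    assert(lo.array[0] == 2.0);
    assert(lo.array[1] != lo.array[1]); // NaN
    assert(hi.array[0] == 2.0);
    assert(hi.array[1] != hi.array[1]); // NaN
}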
/// Duplicate even-indexed double-precision (64-bit) floating-point elements from `a`.
__m256d _mm256_movedup_pd (__m256d a) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_movddup256 (a);
    }
    else
    {
        a.ptr[1] = a.array[0];
        a.ptr[3] = a.array[2];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    A = _mm256_movedup_pd(A);
    double[4] correct = [1.0, 1, 3, 3];
    assert(A.array == correct);
}

/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
__m256 _mm256_movehdup_ps (__m256 a) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_movshdup256 (a);
    }
    else
    {
        a.ptr[0] = a.array[1];
        a.ptr[2] = a.array[3];
        a.ptr[4] = a.array[5];
        a.ptr[6] = a.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    A = _mm256_movehdup_ps(A);
    float[8] correct = [2.0, 2, 4, 4, 6, 6, 8, 8];
    assert(A.array == correct);
}

/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
__m256 _mm256_moveldup_ps (__m256 a) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_movsldup256 (a);
    }
    else
    {
        a.ptr[1] = a.array[0];
        a.ptr[3] = a.array[2];
        a.ptr[5] = a.array[4];
        a.ptr[7] = a.array[6];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    A = _mm256_moveldup_ps(A);
    float[8] correct = [1.0, 1, 3, 3, 5, 5, 7, 7];
    assert(A.array == correct);
}

/// Set each bit of result mask based on the most significant bit of the corresponding packed
/// double-precision (64-bit) floating-point element in `a`.
int _mm256_movemask_pd (__m256d a) @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_movmskpd256(a);
    }
    else static if (LDC_with_SSE2)
    {
        // this doesn't benefit GDC, and not clear for arm64.
        __m128d A_lo = _mm256_extractf128_pd!0(a);
        __m128d A_hi = _mm256_extractf128_pd!1(a);

        return (_mm_movemask_pd(A_hi) << 2) | _mm_movemask_pd(A_lo);
    }
    else
    {
        // Fortunately, branchless on arm64
        long4 lv = cast(long4)a;
        int r = 0;
        if (lv.array[0] < 0) r += 1;
        if (lv.array[1] < 0) r += 2;
        if (lv.array[2] < 0) r += 4;
        if (lv.array[3] < 0) r += 8;
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(-1, -double.infinity, 0, -1);
    assert(_mm256_movemask_pd(A) == 1 + 2 + 8);
}

/// Set each bit of result mask based on the most significant bit of the corresponding packed
/// single-precision (32-bit) floating-point element in `a`.
int _mm256_movemask_ps (__m256 a) @system
{
    // PERF DMD
    // PERF GDC without AVX
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_movmskps256(a);
    }
    else version(LDC)
    {
        // this doesn't benefit GDC (unable to inline), but benefits both LDC with SSE2 and ARM64
        __m128 A_lo = _mm256_extractf128_ps!0(a);
        __m128 A_hi = _mm256_extractf128_ps!1(a);
        return (_mm_movemask_ps(A_hi) << 4) | _mm_movemask_ps(A_lo);
    }
    else
    {
        int8 lv = cast(int8)a;
        int r = 0;
        if (lv.array[0] < 0) r += 1;
        if (lv.array[1] < 0) r += 2;
        if (lv.array[2] < 0) r += 4;
        if (lv.array[3] < 0) r += 8;
        if (lv.array[4] < 0) r += 16;
        if (lv.array[5] < 0) r += 32;
        if (lv.array[6] < 0) r += 64;
        if (lv.array[7] < 0) r += 128;
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(-1, -float.infinity, 0, -1, 1, float.infinity, -2, float.nan);
    assert(_mm256_movemask_ps(A) == 1 + 2 + 8 + 64);
}
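// Example (not in the Intel API): movemask condenses a vector-wide predicate into a
// scalar bitmask, which is convenient for early-exit checks. A minimal sketch testing
// "is any lane negative?" with a single scalar comparison:
unittest
{
    __m256 v = _mm256_setr_ps(0.5f, 1, 2, 3, 4, -5, 6, 7);
    int signs = _mm256_movemask_ps(v);
    assert(signs != 0);         // at least one negative lane
    assert(signs == (1 << 5));  // precisely lane 5
}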
/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_mul_pd (__m256d a, __m256d b) pure @safe
{
    return a * b;
}
unittest
{
    __m256d a = [-2.0, 1.5, -2.0, 1.5];
    a = _mm256_mul_pd(a, a);
    assert(a.array == [4.0, 2.25, 4.0, 2.25]);
}

/// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_mul_ps (__m256 a, __m256 b) pure @safe
{
    return a * b;
}
unittest
{
    __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm256_mul_ps(a, a);
    float[8] correct = [2.25f, 4.0f, 9.0f, 1.0f, 2.25f, 4.0f, 9.0f, 1.0f];
    assert(a.array == correct);
}

/// Compute the bitwise NOT of 256 bits in `a`. #BONUS
__m256i _mm256_not_si256 (__m256i a) pure @safe
{
    return ~a;
}
unittest
{
    __m256i A = _mm256_set1_epi64x(-748);
    long4 notA = cast(long4) _mm256_not_si256(A);
    long[4] correct = [747, 747, 747, 747];
    assert(notA.array == correct);
}

/// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_or_pd (__m256d a, __m256d b) pure @safe
{
    return cast(__m256d)( cast(__m256i)a | cast(__m256i)b );
}

/// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_or_ps (__m256 a, __m256 b) pure @safe
{
    return cast(__m256)( cast(__m256i)a | cast(__m256i)b );
}

/// Shuffle double-precision (64-bit) floating-point elements in `a` using the control in `imm8`.
__m128d _mm_permute_pd(int imm8)(__m128d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilpd(a, imm8 & 3);
    }
    else
    {
        // Shufflevector not particularly better for LDC here
        __m128d r;
        r.ptr[0] = a.array[imm8 & 1];
        r.ptr[1] = a.array[(imm8 >> 1) & 1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(5, 6);
    __m128d B = _mm_permute_pd!1(A);
    __m128d C = _mm_permute_pd!3(A);
    double[2] RB = [6, 5];
    double[2] RC = [6, 6];
    assert(B.array == RB);
    assert(C.array == RC);
}

///ditto
__m256d _mm256_permute_pd(int imm8)(__m256d a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilpd256(a, imm8 & 15);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(double4,
                                 (imm8 >> 0) & 1,
                                 ( (imm8 >> 1) & 1),
                                 2 + ( (imm8 >> 2) & 1),
                                 2 + ( (imm8 >> 3) & 1) )(a, a);
    }
    else
    {
        __m256d r;
        r.ptr[0] = a.array[ imm8 & 1];
        r.ptr[1] = a.array[(imm8 >> 1) & 1];
        r.ptr[2] = a.array[2 + ((imm8 >> 2) & 1)];
        r.ptr[3] = a.array[2 + ((imm8 >> 3) & 1)];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(0.0, 1, 2, 3);
    __m256d R = _mm256_permute_pd!(1 + 4)(A);
    double[4] correct = [1.0, 0, 3, 2];
    assert(R.array == correct);
}
/// Shuffle single-precision (32-bit) floating-point elements in `a` using the control in `imm8`.
__m128 _mm_permute_ps(int imm8)(__m128 a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilps(a, cast(ubyte)imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(float4, (imm8 >> 0) & 3, (imm8 >> 2) & 3, (imm8 >> 4) & 3,
                                 (imm8 >> 6) & 3)(a, a);
    }
    else
    {
        // PERF: could use _mm_shuffle_ps which is a superset
        // when AVX isn't available
        __m128 r;
        r.ptr[0] = a.array[(imm8 >> 0) & 3];
        r.ptr[1] = a.array[(imm8 >> 2) & 3];
        r.ptr[2] = a.array[(imm8 >> 4) & 3];
        r.ptr[3] = a.array[(imm8 >> 6) & 3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(0.0f, 1, 2, 3);
    __m128 R = _mm_permute_ps!(1 + 4 * 3 + 16 * 0 + 64 * 2)(A);
    float[4] correct = [1.0f, 3, 0, 2];
    assert(R.array == correct);
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using
/// the control in `imm8`. The same shuffle is applied to the lower and higher 128-bit lane.
__m256 _mm256_permute_ps(int imm8)(__m256 a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilps256(a, cast(ubyte)imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(float8,
            (imm8 >> 0) & 3, (imm8 >> 2) & 3, (imm8 >> 4) & 3, (imm8 >> 6) & 3,
            4 + ((imm8 >> 0) & 3), 4 + ((imm8 >> 2) & 3), 4 + ((imm8 >> 4) & 3),
            4 + ((imm8 >> 6) & 3))(a, a);
    }
    else
    {
        __m256 r;
        r.ptr[0] = a.array[(imm8 >> 0) & 3];
        r.ptr[1] = a.array[(imm8 >> 2) & 3];
        r.ptr[2] = a.array[(imm8 >> 4) & 3];
        r.ptr[3] = a.array[(imm8 >> 6) & 3];
        r.ptr[4] = a.array[4 + ((imm8 >> 0) & 3)];
        r.ptr[5] = a.array[4 + ((imm8 >> 2) & 3)];
        r.ptr[6] = a.array[4 + ((imm8 >> 4) & 3)];
        r.ptr[7] = a.array[4 + ((imm8 >> 6) & 3)];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    __m256 R = _mm256_permute_ps!(1 + 4 * 3 + 16 * 0 + 64 * 2)(A);
    float[8] correct = [1.0f, 3, 0, 2, 5, 7, 4, 6];
    assert(R.array == correct);
}
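// Example (not in the Intel API): the permute imm8 packs four 2-bit source indices,
// lowest bits selecting lane 0. A minimal sketch reversing each 128-bit lane, where
// the constant encodes indices (3, 2, 1, 0) from lane 0 upward:
unittest
{
    __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    enum int reverse = (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6);
    __m256 R = _mm256_permute_ps!reverse(A);
    float[8] correct = [3.0f, 2, 1, 0, 7, 6, 5, 4];
    assert(R.array == correct);
}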
/// Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// selected by `imm8` from `a` and `b`.
__m256d _mm256_permute2f128_pd(int imm8)(__m256d a, __m256d b) pure @safe
{
    return cast(__m256d) _mm256_permute2f128_si256!imm8(cast(__m256i)a, cast(__m256i)b);
}
///ditto
__m256 _mm256_permute2f128_ps(int imm8)(__m256 a, __m256 b) pure @safe
{
    return cast(__m256) _mm256_permute2f128_si256!imm8(cast(__m256i)a, cast(__m256i)b);
}
///ditto
__m256i _mm256_permute2f128_si256(int imm8)(__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_vperm2f128_si256(cast(int8)a, cast(int8)b, cast(ubyte)imm8);
    }
    else
    {
        static __m128i SELECT4(int imm4)(__m256i a, __m256i b) pure @trusted
        {
            static assert(imm4 >= 0 && imm4 <= 15);
            static if (imm4 & 8)
            {
                return _mm_setzero_si128();
            }
            else static if ((imm4 & 2) == 0)
            {
                long2 r;
                enum int index = 2*(imm4 & 1);
                r.ptr[0] = a.array[index+0];
                r.ptr[1] = a.array[index+1];
                return cast(__m128i)r;
            }
            else
            {
                static assert( (imm4 & 2) != 0);
                long2 r;
                enum int index = 2*(imm4 & 1);
                r.ptr[0] = b.array[index+0];
                r.ptr[1] = b.array[index+1];
                return cast(__m128i)r;
            }
        }

        __m128i lo = SELECT4!(imm8 & 15)(a, b);
        __m128i hi = SELECT4!((imm8 >> 4) & 15)(a, b);
        return _mm256_set_m128i(hi, lo);
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(8.0, 1, 2, 3);
    __m256d B = _mm256_setr_pd(4.0, 5, 6, 7);
    __m256d R = _mm256_permute2f128_pd!(128 + 2)(A, B);
    double[4] correct = [4.0, 5.0, 0.0, 0.0];
    assert(R.array == correct);

    __m256d R2 = _mm256_permute2f128_pd!(3*16 + 1)(A, B);
    double[4] correct2 = [2.0, 3.0, 6.0, 7.0];
    assert(R2.array == correct2);
}
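// Example (not in the Intel API): each imm8 nibble selects a 128-bit half (0 or 1 from `a`,
// 2 or 3 from `b`; bit 3 zeroes the half). A minimal sketch swapping the two halves of a
// single vector with imm8 = 0x01:
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    __m256d S = _mm256_permute2f128_pd!0x01(A, A);
    double[4] correct = [3.0, 4, 1, 2];
    assert(S.array == correct);
}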

/// Shuffle double-precision (64-bit) floating-point elements in `a` using the control in `b`.
/// Warning: the selector is in bit 1, not bit 0, of each 64-bit element!
/// This is really not intuitive.
__m128d _mm_permutevar_pd(__m128d a, __m128i b) pure @trusted
{
    enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64;

    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m128d) __builtin_ia32_vpermilvarpd(a, cast(long2)b);
    }
    else static if (implementWithByteShuffle)
    {
        align(16) static immutable byte[16] mmAddBase_u8 = [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7];
        align(16) static immutable byte[16] mmBroadcast_u8 = [0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8];
        int4 bi = cast(int4)b;
        long2 two;
        two = 2;
        bi = _mm_slli_epi64(cast(__m128i)( (cast(long2)bi) & two), 2);
        bi = _mm_shuffle_epi8(bi, *cast(__m128i*)mmBroadcast_u8.ptr);
        // bi is now [ind0 ind0 ind0 ind0 ind0 ind0 ind0 ind0 ind1 ind1 ind1 ind1 ind1 ind1 ind1 ind1]
        byte16 bytesIndices = cast(byte16)bi;
        bytesIndices = bytesIndices + *cast(byte16*)mmAddBase_u8.ptr;

        // which allows us to make a single _mm_shuffle_epi8
        return cast(__m128d) _mm_shuffle_epi8(cast(__m128i)a, cast(__m128i)bytesIndices);
    }
    else
    {
        // This isn't great on ARM64: the TBL and TBX instructions could fit the bill,
        // but they only take 8-bit operands, not 64-bit ones.
        // SVE2 could do it with svtbx[_f64] probably.
        long2 bl = cast(long2)b;
        __m128d r;
        r.ptr[0] = a.array[ (bl.array[0] & 2) >> 1];
        r.ptr[1] = a.array[ (bl.array[1] & 2) >> 1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(5, 6);
    __m128d B = _mm_permutevar_pd(A, _mm_setr_epi64(2, 1));
    __m128d C = _mm_permutevar_pd(A, _mm_setr_epi64(1 + 2 + 4, 2));
    // yup, this is super strange, it's actually taking bit 1 and not bit 0 of each 64-bit element
    double[2] RB = [6, 5];
    double[2] RC = [6, 6];
    assert(B.array == RB);
    assert(C.array == RC);
}

///ditto
__m256d _mm256_permutevar_pd (__m256d a, __m256i b) pure @trusted
{
    // Worth it: for GDC, in SSSE3+
    //           for LDC, all the time
    version(LDC)
        enum bool implementWithByteShuffle = true;
    else
        enum bool implementWithByteShuffle = GDC_with_SSSE3;

    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256d) __builtin_ia32_vpermilvarpd256(a, cast(long4)b);
    }
    else static if (implementWithByteShuffle)
    {
        // because we don't have 256-bit vectors, split and use _mm_permutevar_pd
        __m128d a_lo = _mm256_extractf128_pd!0(a);
        __m128d a_hi = _mm256_extractf128_pd!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128d r_lo = _mm_permutevar_pd(a_lo, b_lo);
        __m128d r_hi = _mm_permutevar_pd(a_hi, b_hi);
        return _mm256_set_m128d(r_hi, r_lo);
    }
    else
    {
        long4 bl = cast(long4)b;
        __m256d r;
        r.ptr[0] = a.array[ (bl.array[0] & 2) >> 1];
        r.ptr[1] = a.array[ (bl.array[1] & 2) >> 1];
        r.ptr[2] = a.array[2 + ((bl.array[2] & 2) >> 1)];
        r.ptr[3] = a.array[2 + ((bl.array[3] & 2) >> 1)];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(5, 6, 7, 8);
    __m256d B = _mm256_permutevar_pd(A, _mm256_setr_epi64(2, 1, 0, 2));
    __m256d C = _mm256_permutevar_pd(A, _mm256_setr_epi64(1 + 2 + 4, 2, 2, 0));
    // yup, this is super strange, it's actually taking bit 1 and not bit 0 of each 64-bit element
    double[4] RB = [6, 5, 7, 8];
    double[4] RC = [6, 6, 8, 7];
    assert(B.array == RB);
    assert(C.array == RC);
}
/// Shuffle single-precision (32-bit) floating-point elements in `a` using the control in `b`.
__m128 _mm_permutevar_ps (__m128 a, __m128i b) @trusted
{
    // PERF DMD

    enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64;

    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m128) __builtin_ia32_vpermilvarps(a, cast(int4)b);
    }
    else static if (implementWithByteShuffle)
    {
        // This workaround is worth it: in GDC with SSSE3, in LDC with SSSE3, in ARM64 (neon)
        int4 bi = cast(int4)b;
        int4 three;
        three = 3;
        bi = _mm_slli_epi32(bi & three, 2);
        // bi is [ind0 0 0 0 ind1 0 0 0 ind2 0 0 0 ind3 0 0 0]
        bi = bi | _mm_slli_si128!1(bi);
        bi = bi | _mm_slli_si128!2(bi);
        // bi is now [ind0 ind0 ind0 ind0 ind1 ind1 ind1 ind1 ind2 ind2 ind2 ind2 ind3 ind3 ind3 ind3]
        byte16 bytesIndices = cast(byte16)bi;
        align(16) static immutable byte[16] mmAddBase_u8 = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
        bytesIndices = bytesIndices + *cast(byte16*)mmAddBase_u8.ptr;

        // which allows us to make a single _mm_shuffle_epi8
        return cast(__m128) _mm_shuffle_epi8(cast(__m128i)a, cast(__m128i)bytesIndices);
    }
    else
    {
        int4 bi = cast(int4)b;
        __m128 r;
        r.ptr[0] = a.array[ (bi.array[0] & 3) ];
        r.ptr[1] = a.array[ (bi.array[1] & 3) ];
        r.ptr[2] = a.array[ (bi.array[2] & 3) ];
        r.ptr[3] = a.array[ (bi.array[3] & 3) ];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(5, 6, 7, 8);
    __m128 B = _mm_permutevar_ps(A, _mm_setr_epi32(2, 1, 0, 2 + 4));
    __m128 C = _mm_permutevar_ps(A, _mm_setr_epi32(2, 3 + 8, 1, 0));
    float[4] RB = [7, 6, 5, 7];
    float[4] RC = [7, 8, 6, 5];
    assert(B.array == RB);
    assert(C.array == RC);
}

///ditto
__m256 _mm256_permutevar_ps (__m256 a, __m256i b) @trusted
{
    // Without AVX, the best option is to split into two 128-bit halves
    // and use _mm_permutevar_ps (which itself may use _mm_shuffle_epi8).
    enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64;

    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vpermilvarps256(a, cast(int8)b);
    }
    else static if (implementWithByteShuffle)
    {
        // because we don't have 256-bit vectors, split and use _mm_permutevar_ps
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128 r_lo = _mm_permutevar_ps(a_lo, b_lo);
        __m128 r_hi = _mm_permutevar_ps(a_hi, b_hi);
        return _mm256_set_m128(r_hi, r_lo);
    }
    else
    {
        int8 bi = cast(int8)b;
        __m256 r;
        r.ptr[0] = a.array[ (bi.array[0] & 3) ];
        r.ptr[1] = a.array[ (bi.array[1] & 3) ];
        r.ptr[2] = a.array[ (bi.array[2] & 3) ];
        r.ptr[3] = a.array[ (bi.array[3] & 3) ];
        r.ptr[4] = a.array[ 4 + (bi.array[4] & 3) ];
        r.ptr[5] = a.array[ 4 + (bi.array[5] & 3) ];
        r.ptr[6] = a.array[ 4 + (bi.array[6] & 3) ];
        r.ptr[7] = a.array[ 4 + (bi.array[7] & 3) ];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);
    __m256 B = _mm256_permutevar_ps(A, _mm256_setr_epi32(2, 1, 0, 2, 3, 2, 1, 0));
    __m256 C = _mm256_permutevar_ps(A, _mm256_setr_epi32(2, 3 + 8, 1, 0, 2, 3, 0, 1));
    float[8] RB = [3.0f, 2, 1, 3, 8, 7, 6, 5];
    float[8] RC = [3.0f, 4, 2, 1, 7, 8, 5, 6];
    assert(B.array == RB);
    assert(C.array == RC);
}

/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements
/// in `a`. The maximum relative error for this approximation is less than 1.5*2^-12.
__m256 _mm256_rcp_ps (__m256 a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_rcpps256(a);
    }
    else
    {
        a.ptr[0] = 1.0f / a.array[0];
        a.ptr[1] = 1.0f / a.array[1];
        a.ptr[2] = 1.0f / a.array[2];
        a.ptr[3] = 1.0f / a.array[3];
        a.ptr[4] = 1.0f / a.array[4];
        a.ptr[5] = 1.0f / a.array[5];
        a.ptr[6] = 1.0f / a.array[6];
        a.ptr[7] = 1.0f / a.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f, 9, -46, 1869816, 55583);
    __m256 groundTruth = _mm256_set1_ps(1.0f) / A;
    __m256 result = _mm256_rcp_ps(A);
    foreach(i; 0..8)
    {
        double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
        assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
    }
}
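// Example (not in the Intel API): one Newton-Raphson step, x1 = x0 * (2 - a*x0),
// roughly doubles the precision of the rcp estimate. A minimal sketch, assuming
// `_mm256_sub_ps` from this module:
unittest
{
    __m256 a = _mm256_setr_ps(2.34f, 3.0f, 100.0f, 7.5f, 0.5f, 9.0f, 345.5f, 13.0f);
    __m256 x0 = _mm256_rcp_ps(a);
    __m256 two = _mm256_set1_ps(2.0f);
    __m256 x1 = _mm256_mul_ps(x0, _mm256_sub_ps(two, _mm256_mul_ps(a, x0)));
    __m256 truth = _mm256_set1_ps(1.0f) / a;
    foreach(i; 0..8)
    {
        double relError = (cast(double)(truth.array[i]) / x1.array[i]) - 1;
        assert(abs_double(relError) < 1e-5); // much tighter than the raw 1.5*2^-12 estimate
    }
}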

/// Round the packed double-precision (64-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed double-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF     |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF     |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO        |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m256d _mm256_round_pd(int rounding)(__m256d a) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_roundpd256(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // PERF: on non-AVX x86, it would probably be faster to convert those doubles to int64 at once

            __m128d A_lo = _mm256_extractf128_pd!0(a);
            __m128d A_hi = _mm256_extractf128_pd!1(a);

            // Convert to 64-bit integers one by one
            long x0 = _mm_cvtsd_si64(A_lo);
            long x2 = _mm_cvtsd_si64(A_hi);
            A_lo.ptr[0] = A_lo.array[1];
            A_hi.ptr[0] = A_hi.array[1];
            long x1 = _mm_cvtsd_si64(A_lo);
            long x3 = _mm_cvtsd_si64(A_hi);

            return _mm256_setr_pd(x0, x1, x2, x3);
        }
        else
        {
            version(GNU) pragma(inline, false); // this was required for SSE4.1 rounding, leave it here

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            __m128d A_lo = _mm256_extractf128_pd!0(a);
            __m128d A_hi = _mm256_extractf128_pd!1(a);

            // Convert to 64-bit integers one by one
            long x0 = _mm_cvtsd_si64(A_lo);
            long x2 = _mm_cvtsd_si64(A_hi);
            A_lo.ptr[0] = A_lo.array[1];
            A_hi.ptr[0] = A_hi.array[1];
            long x1 = _mm_cvtsd_si64(A_lo);
            long x3 = _mm_cvtsd_si64(A_hi);

            // Convert back to double to achieve the rounding.
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (FUTURE: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return _mm256_setr_pd(x0, x1, x2, x3);
        }
    }
}
unittest
{
    // tested in other intrinsics
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF     |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF     |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO        |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m256 _mm256_round_ps(int rounding)(__m256 a) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_roundps256(a, rounding);
    }
    else static if (GDC_or_LDC_with_SSE41)
    {
        // we can use _mm_round_ps
        __m128 lo = _mm256_extractf128_ps!0(a);
        __m128 hi = _mm256_extractf128_ps!1(a);
        __m128 ilo = _mm_round_ps!rounding(lo); // unfortunately _mm_round_ps isn't fast on arm64, so we avoid it in that case
        __m128 ihi = _mm_round_ps!rounding(hi);
        return _mm256_set_m128(ihi, ilo);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            __m256i integers = _mm256_cvtps_epi32(a);
            return _mm256_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get reordered
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m256i integers = _mm256_cvtps_epi32(a);

            // Convert back to float to achieve the rounding.
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (FUTURE: what range exactly?)
            __m256 result = _mm256_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
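// Example (not in the Intel API): rounding toward negative infinity gives a vector floor.
// A minimal sketch; note the conversion-based fallback above only covers the int32 range:
unittest
{
    __m256 A = _mm256_setr_ps(1.3f, -1.3f, 2.5f, -2.5f, 0.0f, 7.9f, -7.9f, 42.0f);
    __m256 F = _mm256_round_ps!(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)(A);
    float[8] correct = [1.0f, -2, 2, -3, 0, 7, -8, 42];
    assert(F.array == correct);
}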

/// Compute the approximate reciprocal square root of packed single-precision (32-bit)
/// floating-point elements in `a`. The maximum relative error for this approximation is less than
/// 1.5*2^-12.
__m256 _mm256_rsqrt_ps (__m256 a) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_rsqrtps256(a);
    }
    else version(LDC)
    {
        a[0] = 1.0f / llvm_sqrt(a[0]);
        a[1] = 1.0f / llvm_sqrt(a[1]);
        a[2] = 1.0f / llvm_sqrt(a[2]);
        a[3] = 1.0f / llvm_sqrt(a[3]);
        a[4] = 1.0f / llvm_sqrt(a[4]);
        a[5] = 1.0f / llvm_sqrt(a[5]);
        a[6] = 1.0f / llvm_sqrt(a[6]);
        a[7] = 1.0f / llvm_sqrt(a[7]);
        return a;
    }
    else
    {
        a.ptr[0] = 1.0f / sqrt(a.array[0]);
        a.ptr[1] = 1.0f / sqrt(a.array[1]);
        a.ptr[2] = 1.0f / sqrt(a.array[2]);
        a.ptr[3] = 1.0f / sqrt(a.array[3]);
        a.ptr[4] = 1.0f / sqrt(a.array[4]);
        a.ptr[5] = 1.0f / sqrt(a.array[5]);
        a.ptr[6] = 1.0f / sqrt(a.array[6]);
        a.ptr[7] = 1.0f / sqrt(a.array[7]);
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f, 2.34f, 70000.0f, 0.00001f, 345.5f);
    __m256 groundTruth = _mm256_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f,
                                        0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f);
    __m256 result = _mm256_rsqrt_ps(A);
    foreach(i; 0..8)
    {
        double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
        assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
    }
}
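// Example (not in the Intel API): one Newton-Raphson step for rsqrt,
// x1 = x0 * (1.5 - 0.5*a*x0*x0), sharpens the estimate considerably. A minimal sketch,
// assuming `_mm256_sub_ps` from this module:
unittest
{
    __m256 a = _mm256_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f, 9.0f, 46.0f, 4.0f, 256.0f);
    __m256 x0 = _mm256_rsqrt_ps(a);
    __m256 halfAX0 = _mm256_mul_ps(_mm256_mul_ps(_mm256_set1_ps(0.5f), a), x0);
    __m256 x1 = _mm256_mul_ps(x0, _mm256_sub_ps(_mm256_set1_ps(1.5f),
                                                _mm256_mul_ps(halfAX0, x0)));
    foreach(i; 0..8)
    {
        double truth = 1.0 / sqrt(cast(double)a.array[i]);
        double relError = (truth / x1.array[i]) - 1;
        assert(abs_double(relError) < 1e-5);
    }
}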

/// Set packed 16-bit integers with the supplied values.
__m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8,
                          short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short16 r; // Note: = void would prevent GDC from inlining a constant short16...
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    r.ptr[4] = e4;
    r.ptr[5] = e5;
    r.ptr[6] = e6;
    r.ptr[7] = e7;
    r.ptr[8] = e8;
    r.ptr[9] = e9;
    r.ptr[10] = e10;
    r.ptr[11] = e11;
    r.ptr[12] = e12;
    r.ptr[13] = e13;
    r.ptr[14] = e14;
    r.ptr[15] = e15;
    return cast(__m256i) r;
}
unittest
{
    short16 A = cast(short16) _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8,
                                                7,  6,  5,  4,  3,  2, 1, 0);
    foreach(i; 0..16)
        assert(A.array[i] == i);
}

/// Set packed 32-bit integers with the supplied values.
__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
{
    // Inlines a constant with GCC -O1, LDC -O2
    int8 r; // = void would prevent GCC from inlining a constant call
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    r.ptr[4] = e4;
    r.ptr[5] = e5;
    r.ptr[6] = e6;
    r.ptr[7] = e7;
    return cast(__m256i)r;
}
unittest
{
    int8 A = cast(int8) _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    foreach(i; 0..8)
        assert(A.array[i] == i);
}

/// Set packed 64-bit integers with the supplied values.
__m256i _mm256_set_epi64x (long e3, long e2, long e1, long e0) pure @trusted
{
    long4 r = void;
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    return r;
}
unittest
{
    __m256i A = _mm256_set_epi64x(-1, 42, long.min, long.max);
    long[4] correct = [long.max, long.min, 42, -1];
    assert(A.array == correct);
}

///ditto
alias _mm256_set_epi64 = _mm256_set_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API.

/// Set packed 8-bit integers with the supplied values.
__m256i _mm256_set_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
                         byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
                         byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9, byte e8,
                         byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    // Inline a constant call in GDC -O1 and LDC -O2
    align(32) byte[32] result = [ e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
                                  e8,  e9,  e10, e11, e12, e13, e14, e15,
                                  e16, e17, e18, e19, e20, e21, e22, e23,
                                  e24, e25, e26, e27, e28, e29, e30, e31 ];
    return *cast(__m256i*)(result.ptr);
}
unittest
{
    byte32 R = cast(byte32) _mm256_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                             0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7);
    byte[32] correct = [7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0, 3, 2, 1, 0,
                        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
    assert(R.array == correct);
}

/// Set packed `__m256` vector with the supplied values.
__m256 _mm256_set_m128 (__m128 hi, __m128 lo) pure @trusted
{
    // DMD PERF
    static if (GDC_with_AVX)
    {
        __m256 r = __builtin_ia32_ps256_ps(lo);
        return __builtin_ia32_vinsertf128_ps256(r, hi, 1);
    }
    else
    {
        __m256 r = void;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return r;
    }

    /*
        // BUG, doesn't work if AVX vector is emulated, but SSE vector is not
        // See issue #108
        __m256 r = void;
        __m128* p = cast(__m128*)(&r);
        p[0] = lo;
        p[1] = hi;
        return r;
    */
}
unittest
{
    __m128 lo = _mm_setr_ps(1.0f, 2, 3, 4);
    __m128 hi = _mm_setr_ps(3.0f, 4, 5, 6);
    __m256 R = _mm256_set_m128(hi, lo);
    float[8] correct = [1.0f, 2, 3, 4, 3, 4, 5, 6];
    assert(R.array == correct);
}

/// Set packed `__m256d` vector with the supplied values.
__m256d _mm256_set_m128d (__m128d hi, __m128d lo) pure @trusted
{
    __m256d r = void;
    r.ptr[0] = lo.array[0];
    r.ptr[1] = lo.array[1];
    r.ptr[2] = hi.array[0];
    r.ptr[3] = hi.array[1];
    return r;
}
unittest
{
    __m128d lo = _mm_setr_pd(1.0, 2.0);
    __m128d hi = _mm_setr_pd(3.0, 4.0);
    __m256d R = _mm256_set_m128d(hi, lo);
    double[4] correct = [1.0, 2.0, 3.0, 4.0];
    assert(R.array == correct);
}

/// Set packed `__m256i` vector with the supplied values.
__m256i _mm256_set_m128i (__m128i hi, __m128i lo) pure @trusted
{
    // DMD PERF
    static if (GDC_with_AVX)
    {
        __m256i r = cast(long4) __builtin_ia32_si256_si (lo);
        return cast(long4) __builtin_ia32_vinsertf128_si256(cast(int8)r, hi, 1);
    }
    else
    {
        int8 r = void;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return cast(long4)r;
    }
}
unittest
{
    __m128i lo = _mm_setr_epi32( 1, 2, 3, 4);
    __m128i hi = _mm_set_epi32(-3, -4, -5, -6);
    int8 R = cast(int8)_mm256_set_m128i(hi, lo);
    int[8] correct = [1, 2, 3, 4, -6, -5, -4, -3];
    assert(R.array == correct);
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied values.
__m256d _mm256_set_pd (double e3, double e2, double e1, double e0) pure @trusted
{
    __m256d r = void;
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    return r;
}
unittest
{
    __m256d A = _mm256_set_pd(3, 2, 1, 546);
    double[4] correct = [546.0, 1.0, 2.0, 3.0];
    assert(A.array == correct);
}

/// Set packed single-precision (32-bit) floating-point elements with the supplied values.
__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted
{
    // PERF: see #102, use = void?
    __m256 r;
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    r.ptr[4] = e4;
    r.ptr[5] = e5;
    r.ptr[6] = e6;
    r.ptr[7] = e7;
    return r;
}
unittest
{
    __m256 A = _mm256_set_ps(3, 2, 1, 546.0f, -1.25f, -2, -3, 0);
    float[8] correct = [0, -3, -2, -1.25f, 546.0f, 1.0, 2.0, 3.0];
    assert(A.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi16 (short a) pure @trusted
{
    version(DigitalMars)
    {
        // workaround https://issues.dlang.org/show_bug.cgi?id=21469
        // It used to ICE, after that the codegen was just wrong.
        // No issue anymore in DMD 2.101, we can eventually remove that
        static if (__VERSION__ < 2101)
        {
            short16 v = a;
            return cast(__m256i) v;
        }
        else
        {
            pragma(inline, true);
            return cast(__m256i)(short16(a));
        }
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(short16(a));
    }
}
unittest
{
    short16 a = cast(short16) _mm256_set1_epi16(31);
    for (int i = 0; i < 16; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 32-bit integer `a` to all elements.
__m256i _mm256_set1_epi32 (int a) pure @trusted
{
    version(DigitalMars)
    {
        // No issue anymore in DMD 2.101, we can eventually remove that
        static if (__VERSION__ < 2101)
        {
            int8 v = a;
            return cast(__m256i) v;
        }
        else
        {
            pragma(inline, true);
            return cast(__m256i)(int8(a));
        }
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(int8(a));
    }
}
unittest
{
    int8 a = cast(int8) _mm256_set1_epi32(31);
    for (int i = 0; i < 8; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 64-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi64x (long a) pure @trusted
{
    return cast(__m256i)(long4(a));
}
unittest
{
    long4 a = cast(long4) _mm256_set1_epi64x(-31);
    for (int i = 0; i < 4; ++i)
        assert(a.array[i] == -31);
}
///ditto
alias _mm256_set1_epi64 = _mm256_set1_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API.

/// Broadcast 8-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi8 (byte a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        byte32 v = a;
        return cast(__m256i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(byte32(a));
    }
}
unittest
{
    byte32 a = cast(byte32) _mm256_set1_epi8(31);
    for (int i = 0; i < 32; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast double-precision (64-bit) floating-point value `a` to all elements of the return value.
__m256d _mm256_set1_pd (double a) pure @trusted
{
    return __m256d(a);
}
unittest
{
    double a = 464.21;
    double[4] correct = [a, a, a, a];
    double4 A = cast(double4) _mm256_set1_pd(a);
    assert(A.array == correct);
}

/// Broadcast single-precision (32-bit) floating-point value `a` to all elements of the return value.
__m256 _mm256_set1_ps (float a) pure @trusted
{
    return __m256(a);
}
unittest
{
    float a = 464.21f;
    float[8] correct = [a, a, a, a, a, a, a, a];
    float8 A = cast(float8) _mm256_set1_ps(a);
    assert(A.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8,
                           short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[16] result = [ e15, e14, e13, e12, e11, e10, e9, e8,
                         e7,  e6,  e5,  e4,  e3,  e2,  e1, e0];
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) result.ptr);
    }
    else version(LDC)
    {
        return cast(__m256i)( loadUnaligned!(short16)(result.ptr) );
    }
    else
    {
        short16 r;
        for(int n = 0; n < 16; ++n)
            r.ptr[n] = result[n];
        return cast(__m256i)r;
    }
}
unittest
{
    short16 A = cast(short16) _mm256_setr_epi16(-1, 0, -21, 21, 42, 127, -42, -128,
                                                -1, 0, -21, 21, 42, 127, -42, -128);
    short[16] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128];
    assert(A.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
{
    // Inlines a constant with GCC -O1, LDC -O2
    int8 r; // = void would prevent GDC from inlining a constant call
    r.ptr[0] = e7;
    r.ptr[1] = e6;
    r.ptr[2] = e5;
    r.ptr[3] = e4;
    r.ptr[4] = e3;
    r.ptr[5] = e2;
    r.ptr[6] = e1;
    r.ptr[7] = e0;
    return cast(__m256i)r;
}
unittest
{
    int8 A = cast(int8) _mm256_setr_epi32(-1, 0, -2147483648, 2147483647, 42, 666, -42, -666);
    int[8] correct = [-1, 0, -2147483648, 2147483647, 42, 666, -42, -666];
    assert(A.array == correct);
}

/// Set packed 64-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi64x (long e3, long e2, long e1, long e0) pure @trusted
{
    long4 r = void;
    r.ptr[0] = e3;
    r.ptr[1] = e2;
    r.ptr[2] = e1;
    r.ptr[3] = e0;
    return r;
}
unittest
{
    __m256i A = _mm256_setr_epi64x(-1, 42, long.min, long.max);
    long[4] correct = [-1, 42, long.min, long.max];
    assert(A.array == correct);
}
///ditto
alias _mm256_setr_epi64 = _mm256_setr_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API.

/// Set packed 8-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
                          byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
                          byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9, byte e8,
                          byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    // Inline a constant call in GDC -O1 and LDC -O2
    align(32) byte[32] result = [ e31, e30, e29, e28, e27, e26, e25, e24,
                                  e23, e22, e21, e20, e19, e18, e17, e16,
                                  e15, e14, e13, e12, e11, e10, e9,  e8,
                                  e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0];
    return *cast(__m256i*)(result.ptr);
}
unittest
{
    byte32 A = cast(byte32) _mm256_setr_epi8( -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128);
    byte[32] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128];
    assert(A.array == correct);
}

/// Set packed `__m256` vector with the supplied values.
__m256 _mm256_setr_m128 (__m128 lo, __m128 hi)
{
    return _mm256_set_m128(hi, lo);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
    __m128 B = _mm_setr_ps(3.0f, 4, 5, 6);
    __m256 R = _mm256_setr_m128(B, A);
    float[8] correct = [3.0f, 4, 5, 6, 1, 2, 3, 4];
    assert(R.array == correct);
}

/// Set packed `__m256d` vector with the supplied values.
__m256d _mm256_setr_m128d (__m128d lo, __m128d hi)
{
    return _mm256_set_m128d(hi, lo);
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    __m256d R = _mm256_setr_m128d(B, A);
    double[4] correct = [3.0, 4.0, 1.0, 2.0];
    assert(R.array == correct);
}

/// Set packed `__m256i` vector with the supplied values.
__m256i _mm256_setr_m128i (__m128i lo, __m128i hi)
{
    return _mm256_set_m128i(hi, lo);
}
unittest
{
    __m128i A = _mm_setr_epi32( 1, 2, 3, 4);
    __m128i B = _mm_set_epi32(-3, -4, -5, -6);
    int8 R = cast(int8)_mm256_setr_m128i(B, A);
    int[8] correct = [-6, -5, -4, -3, 1, 2, 3, 4];
    assert(R.array == correct);
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
__m256d _mm256_setr_pd (double e3, double e2, double e1, double e0) pure @trusted
{
    version(LDC)
    {
        // PERF, probably not the best
        double[4] result = [e3, e2, e1, e0];
        return loadUnaligned!(double4)(result.ptr);
    }
    else
    {
        __m256d r;
        r.ptr[0] = e3;
        r.ptr[1] = e2;
        r.ptr[2] = e1;
        r.ptr[3] = e0;
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(3, 2, 1, 546.125);
    double[4] correct = [3.0, 2.0, 1.0, 546.125];
    assert(A.array == correct);
}

/// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order.
__m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        align(32) float[8] r = [ e7, e6, e5, e4, e3, e2, e1, e0];
        return *cast(__m256*)r;
    }
    else version(LDC)
    {
        align(32) float[8] r = [ e7, e6, e5, e4, e3, e2, e1, e0];
        return *cast(__m256*)r;
    }
    else
    {
        __m256 r;
        r.ptr[0] = e7;
        r.ptr[1] = e6;
        r.ptr[2] = e5;
        r.ptr[3] = e4;
        r.ptr[4] = e3;
        r.ptr[5] = e2;
        r.ptr[6] = e1;
        r.ptr[7] = e0;
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps( 3, 2, 1, 546.125f, 4, 5, 6, 7);
    float[8] correct = [3.0f, 2, 1, 546.125f, 4, 5, 6, 7];
    assert(A.array == correct);
}

/// Return vector of type `__m256d` with all elements set to zero.
__m256d _mm256_setzero_pd() pure @safe
{
    return double4(0.0);
}
unittest
{
    __m256d A = _mm256_setzero_pd();
    double[4] correct = [0.0, 0.0, 0.0, 0.0];
    assert(A.array == correct);
}

/// Return vector of type `__m256` with all elements set to zero.
__m256 _mm256_setzero_ps() pure @safe
{
    return float8(0.0f);
}
unittest
{
    __m256 A = _mm256_setzero_ps();
    float[8] correct = [0.0f, 0, 0, 0, 0, 0, 0, 0];
    assert(A.array == correct);
}

/// Return vector of type `__m256i` with all elements set to zero.
__m256i _mm256_setzero_si256() pure @trusted
{
    return __m256i(0);
}
unittest
{
    __m256i A = _mm256_setzero_si256();
    long[4] correct = [0, 0, 0, 0];
    assert(A.array == correct);
}

/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the
/// control in `imm8`.
__m256d _mm256_shuffle_pd(int imm8)(__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_shufpd256(a, b, imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(double4,
                                 (imm8 >> 0) & 1,
                                 4 + ( (imm8 >> 1) & 1),
                                 2 + ( (imm8 >> 2) & 1),
                                 6 + ( (imm8 >> 3) & 1) )(a, b);
    }
    else
    {
        double4 r = void;
        r.ptr[0] = a.array[(imm8 >> 0) & 1];
        r.ptr[1] = b.array[(imm8 >> 1) & 1];
        r.ptr[2] = a.array[2 + ( (imm8 >> 2) & 1)];
        r.ptr[3] = b.array[2 + ( (imm8 >> 3) & 1)];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd( 0, 1, 2, 3);
    __m256d B = _mm256_setr_pd( 4, 5, 6, 7);
    __m256d C = _mm256_shuffle_pd!75 /* 01001011 */(A, B);
    double[4] correct = [1.0, 5.0, 2.0, 7.0];
    assert(C.array == correct);
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using
/// the control in `imm8`.
__m256 _mm256_shuffle_ps(int imm8)(__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_shufps256(a, b, imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(float8, (imm8 >> 0) & 3,
                                 (imm8 >> 2) & 3,
                                 8 + ( (imm8 >> 4) & 3),
                                 8 + ( (imm8 >> 6) & 3),
                                 4 + ( (imm8 >> 0) & 3),
                                 4 + ( (imm8 >> 2) & 3),
                                 12 + ( (imm8 >> 4) & 3),
                                 12 + ( (imm8 >> 6) & 3) )(a, b);
    }
    else
    {
        float8 r = void;
        r.ptr[0] = a.array[(imm8 >> 0) & 3];
        r.ptr[1] = a.array[(imm8 >> 2) & 3];
        r.ptr[2] = b.array[(imm8 >> 4) & 3];
        r.ptr[3] = b.array[(imm8 >> 6) & 3];
        r.ptr[4] = a.array[4 + ( (imm8 >> 0) & 3 )];
        r.ptr[5] = a.array[4 + ( (imm8 >> 2) & 3 )];
        r.ptr[6] = b.array[4 + ( (imm8 >> 4) & 3 )];
        r.ptr[7] = b.array[4 + ( (imm8 >> 6) & 3 )];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps( 0, 1, 2, 3, 4, 5, 6, 7);
    __m256 B = _mm256_setr_ps( 8, 9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_shuffle_ps!75 /* 01001011 */(A, B);
    float[8] correct = [3.0f, 2, 8, 9, 7, 6, 12, 13];
    assert(C.array == correct);
}

/// Compute the square root of packed double-precision (64-bit) floating-point elements in `a`.
__m256d _mm256_sqrt_pd (__m256d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_sqrtpd256(a);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2084)
            return llvm_sqrt(a); // that capability appeared in LDC 1.14
        else
        {
            a.ptr[0] = llvm_sqrt(a.array[0]);
            a.ptr[1] = llvm_sqrt(a.array[1]);
            a.ptr[2] = llvm_sqrt(a.array[2]);
            a.ptr[3] = llvm_sqrt(a.array[3]);
            return a;
        }
    }
    else
    {
        a.ptr[0] = sqrt(a.array[0]);
        a.ptr[1] = sqrt(a.array[1]);
        a.ptr[2] = sqrt(a.array[2]);
        a.ptr[3] = sqrt(a.array[3]);
        return a;
    }
}
unittest
{
    __m256d A = _mm256_sqrt_pd(_mm256_set1_pd(4.0));
    double[4] correct = [2.0, 2, 2, 2];
    assert(A.array == correct);
}

/// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
/// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
__m256 _mm256_sqrt_ps (__m256 a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_sqrtps256(a);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2084)
            return llvm_sqrt(a); // that capability appeared in LDC 1.14
        else
        {
            a.ptr[0] = llvm_sqrt(a.array[0]);
            a.ptr[1] = llvm_sqrt(a.array[1]);
            a.ptr[2] = llvm_sqrt(a.array[2]);
            a.ptr[3] = llvm_sqrt(a.array[3]);
            a.ptr[4] = llvm_sqrt(a.array[4]);
            a.ptr[5] = llvm_sqrt(a.array[5]);
            a.ptr[6] = llvm_sqrt(a.array[6]);
            a.ptr[7] = llvm_sqrt(a.array[7]);
            return a;
        }
    }
    else
    {
        a.ptr[0] = sqrt(a.array[0]);
        a.ptr[1] = sqrt(a.array[1]);
        a.ptr[2] = sqrt(a.array[2]);
        a.ptr[3] = sqrt(a.array[3]);
        a.ptr[4] = sqrt(a.array[4]);
        a.ptr[5] = sqrt(a.array[5]);
        a.ptr[6] = sqrt(a.array[6]);
        a.ptr[7] = sqrt(a.array[7]);
        return a;
    }
}
unittest
{
    __m256 A = _mm256_sqrt_ps(_mm256_set1_ps(4.0f));
    float[8] correct = [2.0f, 2, 2, 2, 2, 2, 2, 2];
    assert(A.array == correct);
}

/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
/// exception may be generated.
void _mm256_store_pd (double* mem_addr, __m256d a) pure @system
{
    *cast(__m256d*)mem_addr = a;
}
unittest
{
    align(32) double[4] mem;
    double[4] correct = [1.0, 2, 3, 4];
    _mm256_store_pd(mem.ptr, _mm256_setr_pd(1.0, 2, 3, 4));
    assert(mem == correct);
}

/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
/// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
/// exception may be generated.
void _mm256_store_ps (float* mem_addr, __m256 a) pure @system
{
    *cast(__m256*)mem_addr = a;
}
unittest
{
    align(32) float[8] mem;
    float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8];
    _mm256_store_ps(mem.ptr, _mm256_set_ps(8.0, 7, 6, 5, 4, 3, 2, 1));
    assert(mem == correct);
}

/// Store 256-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 32-byte
/// boundary or a general-protection exception may be generated.
void _mm256_store_si256 (__m256i * mem_addr, __m256i a) pure @safe
{
    *mem_addr = a;
}
unittest
{
    align(32) long[4] mem;
    long[4] correct = [5, -6, -7, 8];
    _mm256_store_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
    assert(mem == correct);
}
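// A minimal sketch of the alignment contract above: `align(32)` guarantees the
// 32-byte alignment that `_mm256_load_pd` / `_mm256_store_pd` require, here
// used for a trivial 8-element copy.
unittest
{
    align(32) double[8] src = [1, 2, 3, 4, 5, 6, 7, 8];
    align(32) double[8] dst;
    _mm256_store_pd(dst.ptr,     _mm256_load_pd(src.ptr));
    _mm256_store_pd(dst.ptr + 4, _mm256_load_pd(src.ptr + 4));
    assert(dst == src);
}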
/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm256_storeu_pd (double* mem_addr, __m256d a) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        __builtin_ia32_storeupd256(mem_addr, a);
    }
    else version(LDC)
    {
        storeUnaligned!__m256d(a, mem_addr);
    }
    else
    {
        for(int n = 0; n < 4; ++n)
            mem_addr[n] = a.array[n];
    }
}
unittest
{
    align(32) double[6] arr = [0.0, 0, 0, 0, 0, 0];
    _mm256_storeu_pd(&arr[1], _mm256_set1_pd(4.0));
    double[4] correct = [4.0, 4, 4, 4];
    assert(arr[1..5] == correct);
}

/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm256_storeu_ps (float* mem_addr, __m256 a) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        __builtin_ia32_storeups256(mem_addr, a);
    }
    else version(LDC)
    {
        storeUnaligned!__m256(a, mem_addr);
    }
    else
    {
        for(int n = 0; n < 8; ++n)
            mem_addr[n] = a.array[n];
    }
}
unittest
{
    align(32) float[10] arr = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    _mm256_storeu_ps(&arr[1], _mm256_set1_ps(4.0f));
    float[8] correct = [4.0f, 4, 4, 4, 4, 4, 4, 4];
    assert(arr[1..9] == correct);
}

/// Store 256-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned
/// on any particular boundary.
void _mm256_storeu_si256 (__m256i* mem_addr, __m256i a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        __builtin_ia32_storedqu256(cast(char*)mem_addr, cast(ubyte32) a);
    }
    else version(LDC)
    {
        storeUnaligned!__m256i(a, cast(long*)mem_addr);
    }
    else
    {
        long4 v = cast(long4)a;
        long* p = cast(long*)mem_addr;
        for(int n = 0; n < 4; ++n)
            p[n] = v.array[n];
    }
}
unittest
{
    align(32) long[6] arr = [0, 0, 0, 0, 0, 0];
    _mm256_storeu_si256( cast(__m256i*) &arr[1], _mm256_set1_epi64x(4));
    long[4] correct = [4, 4, 4, 4];
    assert(arr[1..5] == correct);
}

/// Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a` into two different 128-bit memory locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a) pure @system
{
    // This generates markedly better code on GDC, and similar code on LDC,
    // compared to using other intrinsics.
    loaddr[0] = a.array[0];
    loaddr[1] = a.array[1];
    loaddr[2] = a.array[2];
    loaddr[3] = a.array[3];
    hiaddr[0] = a.array[4];
    hiaddr[1] = a.array[5];
    hiaddr[2] = a.array[6];
    hiaddr[3] = a.array[7];
}
unittest
{
    align(32) float[11] A = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    _mm256_storeu2_m128(&A[1], &A[6], _mm256_set1_ps(2.0f));
    float[11] correct = [0.0f, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0];
    assert(A == correct);
}
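// A sketch of a common tail-handling trick enabled by the unaligned stores
// above: when the element count isn't a multiple of 8, let the final store
// overlap the previous one instead of falling back to scalar code.
unittest
{
    float[11] buf = 0.0f;
    __m256 fours = _mm256_set1_ps(4.0f);
    _mm256_storeu_ps(buf.ptr, fours);          // elements 0..7
    _mm256_storeu_ps(buf.ptr + 11 - 8, fours); // elements 3..10, overlap is harmless
    float[11] correct = [4.0f, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4];
    assert(buf == correct);
}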
/// Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a` into two different 128-bit memory locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a) pure @system
{
    loaddr[0] = a.array[0];
    loaddr[1] = a.array[1];
    hiaddr[0] = a.array[2];
    hiaddr[1] = a.array[3];
}
unittest
{
    double[2] A;
    double[2] B;
    _mm256_storeu2_m128d(A.ptr, B.ptr, _mm256_set1_pd(-43.0));
    double[2] correct = [-43.0, -43];
    assert(A == correct);
    assert(B == correct);
}

/// Store the high and low 128-bit halves (each composed of integer data) from `a` into two
/// different 128-bit memory locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a) pure @trusted
{
    long* hi = cast(long*)hiaddr;
    long* lo = cast(long*)loaddr;
    lo[0] = a.array[0];
    lo[1] = a.array[1];
    hi[0] = a.array[2];
    hi[1] = a.array[3];
}
unittest
{
    long[2] A;
    long[2] B;
    _mm256_storeu2_m128i(cast(__m128i*)A.ptr, cast(__m128i*)B.ptr, _mm256_set1_epi64x(-42));
    long[2] correct = [-42, -42];
    assert(A == correct);
    assert(B == correct);
}

/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_pd (double* mem_addr, __m256d a) pure @system
{
    // PERF DMD
    // PERF GDC + SSE2
    static if (LDC_with_InlineIREx)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x double> %1, <4 x double>* %0, align 32, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, double4*, double4)(cast(double4*)mem_addr, a);
    }
    else static if (GDC_with_AVX)
    {
        __builtin_ia32_movntpd256 (mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m256d* dest = cast(__m256d*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(32) double[4] mem;
    double[4] correct = [5.0, -6, -7, 8];
    _mm256_stream_pd(mem.ptr, _mm256_setr_pd(5.0, -6, -7, 8));
    assert(mem == correct);
}
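// A sketch of the pattern the note above describes: stream the stores, then
// publish with `_mm_sfence()` (pulled in from the SSE intrinsics) before other
// threads read the buffer.
unittest
{
    align(32) double[8] dst;
    __m256d v = _mm256_set1_pd(1.5);
    _mm256_stream_pd(dst.ptr,     v);
    _mm256_stream_pd(dst.ptr + 4, v);
    _mm_sfence(); // make the streamed data visible to readers on other cores
    foreach (x; dst)
        assert(x == 1.5);
}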
/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
/// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_ps (float* mem_addr, __m256 a) pure @system
{
    // PERF DMD
    // PERF GDC + SSE2
    static if (LDC_with_InlineIREx)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <8 x float> %1, <8 x float>* %0, align 32, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, float8*, float8)(cast(float8*)mem_addr, a);
    }
    else static if (GDC_with_AVX)
    {
        __builtin_ia32_movntps256 (mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m256* dest = cast(__m256*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(32) float[8] mem;
    float[8] correct = [5, -6, -7, 8, 1, 2, 3, 4];
    _mm256_stream_ps(mem.ptr, _mm256_setr_ps(5, -6, -7, 8, 1, 2, 3, 4));
    assert(mem == correct);
}

/// Store 256-bits of integer data from `a` into memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be
/// generated.
/// Note: depending on the compiler, this may be implemented as two SSE2 non-temporal stores.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_si256 (__m256i * mem_addr, __m256i a) pure @trusted
{
    // PERF DMD
    // PERF GDC
    static if (LDC_with_InlineIREx)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x i64> %1, <4 x i64>* %0, align 32, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, long4*, long4)(mem_addr, a);
    }
    else static if (GDC_with_SSE2) // any hope to be non-temporal? Using SSE2 instructions.
    {
        long2 lo, hi;
        lo.ptr[0] = a.array[0];
        lo.ptr[1] = a.array[1];
        hi.ptr[0] = a.array[2];
        hi.ptr[1] = a.array[3];
        _mm_stream_si128(cast(__m128i*)mem_addr, cast(__m128i)lo);
        _mm_stream_si128((cast(__m128i*)mem_addr) + 1, cast(__m128i)hi);
    }
    else
    {
        // Regular store instead.
        __m256i* dest = cast(__m256i*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(32) long[4] mem;
    long[4] correct = [5, -6, -7, 8];
    _mm256_stream_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
    assert(mem == correct);
}

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from
/// packed double-precision (64-bit) floating-point elements in `a`.
__m256d _mm256_sub_pd (__m256d a, __m256d b) pure @safe
{
    return a - b;
}
unittest
{
    __m256d a = [1.5, -2.0, 3.0, 200000.0];
    a = _mm256_sub_pd(a, a);
    double[4] correct = [0.0, 0, 0, 0];
    assert(a.array == correct);
}

/// Subtract packed single-precision (32-bit) floating-point elements in `b` from
/// packed single-precision (32-bit) floating-point elements in `a`.
__m256 _mm256_sub_ps (__m256 a, __m256 b) pure @safe
{
    return a - b;
}
unittest
{
    __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2000.0f, 3.0f, 1.0f];
    a = _mm256_sub_ps(a, a);
    float[8] correct = [0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == correct);
}
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and
/// return 1 if the sign bit of each 64-bit element in the intermediate value is zero,
/// otherwise return 0.
int _mm_testc_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestcpd(a, b);
    }
    else
    {
        // PERF: maybe do the generic version more like simde
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 r = ~la & lb;
        return r.array[0] >= 0 && r.array[1] >= 0;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(-1, 1);
    __m128d B = _mm_setr_pd(-1, -1);
    __m128d C = _mm_setr_pd(1, -1);
    assert(_mm_testc_pd(A, A) == 1);
    assert(_mm_testc_pd(A, B) == 0);
    assert(_mm_testc_pd(B, A) == 1);
}

///ditto
int _mm256_testc_pd (__m256d a, __m256d b) pure @safe
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestcpd256(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // better to split than do vanilla (down to 10 inst)
        __m128d lo_a = _mm256_extractf128_pd!0(a);
        __m128d lo_b = _mm256_extractf128_pd!0(b);
        __m128d hi_a = _mm256_extractf128_pd!1(a);
        __m128d hi_b = _mm256_extractf128_pd!1(b);
        return _mm_testc_pd(lo_a, lo_b) & _mm_testc_pd(hi_a, hi_b);
    }
    else
    {
        // PERF: do the generic version more like simde; maybe that gets rid of the arm64 version
        long4 la = cast(long4)a;
        long4 lb = cast(long4)b;
        long4 r = ~la & lb;
        return r.array[0] >= 0 && r.array[1] >= 0 && r.array[2] >= 0 && r.array[3] >= 0;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(-1, 1, -1, 1);
    __m256d B = _mm256_setr_pd(-1, -1, -1, -1);
    __m256d C = _mm256_setr_pd(1, -1, 1, -1);
    assert(_mm256_testc_pd(A, A) == 1);
    assert(_mm256_testc_pd(A, B) == 0);
    assert(_mm256_testc_pd(B, A) == 1);
}
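// A sketch of a common idiom for these predicates: feed `_mm256_testc_pd` a
// comparison mask and an all-ones mask to ask "did every lane pass?", assuming
// the `_mm256_cmp_pd` template defined earlier in this module.
unittest
{
    __m256d x       = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    __m256d limit   = _mm256_set1_pd(5.0);
    __m256d allOnes = _mm256_cmp_pd!_CMP_EQ_OQ(x, x);    // x == x: every lane all-ones
    __m256d below   = _mm256_cmp_pd!_CMP_LT_OQ(x, limit);
    assert(_mm256_testc_pd(below, allOnes) == 1); // all four lanes below the limit
}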
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and
/// return 1 if the sign bit of each 32-bit element in the intermediate value is zero,
/// otherwise return 0.
int _mm_testc_ps (__m128 a, __m128 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestcps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        int4 la = cast(int4)a;
        int4 lb = cast(int4)b;
        int4 r = ~la & lb;
        int4 shift;
        shift = 31;
        r >>= shift;
        int[4] zero = [0, 0, 0, 0];
        return r.array == zero;
    }
    else
    {
        // PERF: do the generic version more like simde; maybe that gets rid of the arm64 version
        int4 la = cast(int4)a;
        int4 lb = cast(int4)b;
        int4 r = ~la & lb;
        return r.array[0] >= 0 && r.array[1] >= 0 && r.array[2] >= 0 && r.array[3] >= 0;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(-1, 1, -1, 1);
    __m128 B = _mm_setr_ps(-1, -1, -1, -1);
    __m128 C = _mm_setr_ps( 1, -1, 1, -1);
    assert(_mm_testc_ps(A, A) == 1);
    assert(_mm_testc_ps(A, B) == 0);
    assert(_mm_testc_ps(B, A) == 1);
}

///ditto
int _mm256_testc_ps (__m256 a, __m256 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestcps256(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        int8 la = cast(int8)a;
        int8 lb = cast(int8)b;
        int8 r = ~la & lb;
        int8 shift;
        shift = 31;
        r >>= shift;
        int[8] zero = [0, 0, 0, 0, 0, 0, 0, 0];
        return r.array == zero;
    }
    else
    {
        // PERF: do the generic version more like simde; maybe that gets rid of the arm64 version
        int8 la = cast(int8)a;
        int8 lb = cast(int8)b;
        int8 r = ~la & lb;
        return r.array[0] >= 0
            && r.array[1] >= 0
            && r.array[2] >= 0
            && r.array[3] >= 0
            && r.array[4] >= 0
            && r.array[5] >= 0
            && r.array[6] >= 0
            && r.array[7] >= 0;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(-1, 1, -1, 1, -1, 1, -1, 1);
    __m256 B = _mm256_setr_ps(-1, -1, -1, -1, -1, -1, -1, -1);
    __m256 C = _mm256_setr_ps( 1, -1, 1, -1, 1, 1, 1, 1);
    assert(_mm256_testc_ps(A, A) == 1);
    assert(_mm256_testc_ps(B, B) == 1);
    assert(_mm256_testc_ps(A, B) == 0);
    assert(_mm256_testc_ps(B, A) == 1);
    assert(_mm256_testc_ps(C, B) == 0);
    assert(_mm256_testc_ps(B, C) == 1);
}
/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the result is zero,
/// otherwise return 0.
/// In other words, test if all bits masked by `b` are also 1 in `a`.
int _mm256_testc_si256 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_ptestc256(cast(long4)a, cast(long4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // better to split than do vanilla (down to 10 inst)
        __m128i lo_a = _mm256_extractf128_si256!0(a);
        __m128i lo_b = _mm256_extractf128_si256!0(b);
        __m128i hi_a = _mm256_extractf128_si256!1(a);
        __m128i hi_b = _mm256_extractf128_si256!1(b);
        return _mm_testc_si128(lo_a, lo_b) & _mm_testc_si128(hi_a, hi_b);
    }
    else
    {
        __m256i c = ~a & b;
        long[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m256i A  = _mm256_setr_epi64x(0x01, 0x02, 0x04, 0xf8);
    __m256i M1 = _mm256_setr_epi64x(0xfe, 0xfd, 0x00, 0x00);
    __m256i M2 = _mm256_setr_epi64x(0x00, 0x00, 0x04, 0x00);
    assert(_mm256_testc_si256(A, A) == 1);
    assert(_mm256_testc_si256(A, M1) == 0);
    assert(_mm256_testc_si256(A, M2) == 1);
}

/// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 128-bit value, and set ZF to 1 if the
/// sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that corresponds to a positive
/// number in `a`, AND there is at least one negative number in `b` that corresponds to a
/// negative number in `a`.
int _mm_testnzc_pd (__m128d a, __m128d b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcpd(a, b);
    }
    else
    {
        // ZF = 0 means "there is at least one pair of negative numbers"
        // ZF = 1 means "no pairs of negative numbers"
        // CF = 0 means "there is a negative number in b that is next to a positive number in a"
        // CF = 1 means "all negative numbers in b are also negative in a"
        // Consequently, CF = 0 and ZF = 0 means:
        // "There is a pair of matching negative numbers in a and b,
        //  AND also there is a negative number in b, that is matching a positive number in a".
        // Phew.

        // courtesy of simd-everywhere
        __m128i m = _mm_and_si128(cast(__m128i)a, cast(__m128i)b);
        __m128i m2 = _mm_andnot_si128(cast(__m128i)a, cast(__m128i)b);
        m = _mm_srli_epi64(m, 63);
        m2 = _mm_srli_epi64(m2, 63);
        return cast(int)( m.array[0] | m.array[2]) & (m2.array[0] | m2.array[2]);
    }
}
unittest
{
    __m128d PM = _mm_setr_pd( 1, -1);
    __m128d MP = _mm_setr_pd(-1,  1);
    __m128d MM = _mm_setr_pd(-1, -1);
    assert(_mm_testnzc_pd(PM, MP) == 0);
    assert(_mm_testnzc_pd(PM, MM) == 1);
    assert(_mm_testnzc_pd(MP, MP) == 0);
    assert(_mm_testnzc_pd(MP, MM) == 1);
    assert(_mm_testnzc_pd(MM, MM) == 0);
}
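// A worked illustration of the ZF/CF reasoning in the comments above.
unittest
{
    // Lane 0: both negative, so ZF = 0; lane 1: b negative where a is
    // positive, so CF = 0. ZF == CF == 0 makes the predicate 1.
    __m128d a = _mm_setr_pd(-1.0,  1.0);
    __m128d b = _mm_setr_pd(-1.0, -1.0);
    assert(_mm_testnzc_pd(a, b) == 1);
}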
/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 256-bit value, and set ZF to 1 if the
/// sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that corresponds to a positive
/// number in `a`, AND there is at least one negative number in `b` that corresponds to a
/// negative number in `a`.
int _mm256_testnzc_pd (__m256d a, __m256d b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcpd256(a, b);
    }
    else
    {
        long4 la = cast(long4)a;
        long4 lb = cast(long4)b;
        long4 r = la & lb;
        long m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
        int ZF = (~m >> 63) & 1;
        long4 r2 = ~la & lb;
        long m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3];
        int CF = (~m2 >> 63) & 1;
        return (CF | ZF) == 0;
    }
}
unittest
{
    __m256d PM = _mm256_setr_pd( 1, -1, 1, 1);
    __m256d MP = _mm256_setr_pd(-1,  1, 1, 1);
    __m256d MM = _mm256_setr_pd(-1, -1, 1, 1);
    assert(_mm256_testnzc_pd(PM, MP) == 0);
    assert(_mm256_testnzc_pd(PM, MM) == 1);
    assert(_mm256_testnzc_pd(MP, MP) == 0);
    assert(_mm256_testnzc_pd(MP, MM) == 1);
    assert(_mm256_testnzc_pd(MM, MM) == 0);
}

/// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 128-bit value, and set ZF to 1 if the
/// sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that corresponds to a positive
/// number in `a`, AND there is at least one negative number in `b` that corresponds to a
/// negative number in `a`.
int _mm_testnzc_ps (__m128 a, __m128 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcps(a, b);
    }
    else
    {
        int4 la = cast(int4)a;
        int4 lb = cast(int4)b;
        int4 r = la & lb;
        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
        int ZF = (~m >> 31) & 1;
        int4 r2 = ~la & lb;
        int m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3];
        int CF = (~m2 >> 31) & 1;
        return (CF | ZF) == 0;
    }
}
unittest
{
    __m128 PM = _mm_setr_ps( 1, -1, 1, 1);
    __m128 MP = _mm_setr_ps(-1,  1, 1, 1);
    __m128 MM = _mm_setr_ps(-1, -1, 1, 1);
    assert(_mm_testnzc_ps(PM, MP) == 0);
    assert(_mm_testnzc_ps(PM, MM) == 1);
    assert(_mm_testnzc_ps(MP, MP) == 0);
    assert(_mm_testnzc_ps(MP, MM) == 1);
    assert(_mm_testnzc_ps(MM, MM) == 0);
}
/// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 256-bit value, and set ZF to 1 if the
/// sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that corresponds to a positive
/// number in `a`, AND there is at least one negative number in `b` that corresponds to a
/// negative number in `a`.
int _mm256_testnzc_ps (__m256 a, __m256 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcps256(a, b);
    }
    else
    {
        int8 la = cast(int8)a;
        int8 lb = cast(int8)b;
        int8 r = la & lb;
        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3]
              | r.array[4] | r.array[5] | r.array[6] | r.array[7];
        int ZF = (~m >> 31) & 1;
        int8 r2 = ~la & lb;
        int m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3]
               | r2.array[4] | r2.array[5] | r2.array[6] | r2.array[7];
        int CF = (~m2 >> 31) & 1;
        return (CF | ZF) == 0;
    }
}
unittest
{
    __m256 PM = _mm256_setr_ps(1, 1, 1, 1,  1, -1, 1, 1);
    __m256 MP = _mm256_setr_ps(1, 1, 1, 1, -1,  1, 1, 1);
    __m256 MM = _mm256_setr_ps(1, 1, 1, 1, -1, -1, 1, 1);
    assert(_mm256_testnzc_ps(PM, MP) == 0);
    assert(_mm256_testnzc_ps(PM, MM) == 1);
    assert(_mm256_testnzc_ps(MP, MP) == 0);
    assert(_mm256_testnzc_ps(MP, MM) == 1);
    assert(_mm256_testnzc_ps(MM, MM) == 0);
}

/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm256_testnzc_si256 (__m256i a, __m256i b) pure @trusted
{
    // PERF ARM64
    // PERF DMD
    // PERF LDC without AVX
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_ptestnzc256(cast(long4) a, cast(long4) b);
    }
    else
    {
        // Need to defer to _mm_testnzc_si128 if possible, for more speed
        __m256i c = a & b;
        __m256i d = ~a & b;
        long m = c.array[0] | c.array[1] | c.array[2] | c.array[3];
        long n = d.array[0] | d.array[1] | d.array[2] | d.array[3];
        return (m != 0) & (n != 0);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0, 0, 0, 0);
    __m256i M = _mm256_setr_epi32(0x01, 0x40, 0x00, 0x00, 0, 0, 0, 0);
    __m256i Z = _mm256_setzero_si256();
    assert(_mm256_testnzc_si256(A, Z) == 0);
    assert(_mm256_testnzc_si256(A, M) == 1);
    assert(_mm256_testnzc_si256(A, A) == 0);
}
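// A sketch of how the three integer test predicates relate: for any input
// pair, `_mm256_testnzc_si256` is 1 exactly when both `_mm256_testz_si256`
// and `_mm256_testc_si256` are 0.
unittest
{
    __m256i a = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0, 0, 0, 0);
    __m256i b = _mm256_setr_epi32(0x01, 0x40, 0x00, 0x00, 0, 0, 0, 0);
    int z = _mm256_testz_si256(a, b);
    int c = _mm256_testc_si256(a, b);
    assert(_mm256_testnzc_si256(a, b) == (!z && !c));
}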
/// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 128-bit value, and return 1 if the sign
/// bit of each 64-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number in the same place.
int _mm_testz_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzpd(a, b);
    }
    else
    {
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 r = la & lb;
        long m = r.array[0] | r.array[1];
        return (~m >> 63) & 1;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(-1, 1);
    __m128d B = _mm_setr_pd(-1, -1);
    __m128d C = _mm_setr_pd(1, -1);
    assert(_mm_testz_pd(A, A) == 0);
    assert(_mm_testz_pd(A, B) == 0);
    assert(_mm_testz_pd(C, A) == 1);
}

/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 256-bit value, and return 1 if the sign
/// bit of each 64-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number in the same place.
int _mm256_testz_pd (__m256d a, __m256d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzpd256(a, b);
    }
    else
    {
        long4 la = cast(long4)a;
        long4 lb = cast(long4)b;
        long4 r = la & lb;
        long r2 = r.array[0] | r.array[1] | r.array[2] | r.array[3];
        return (~r2 >> 63) & 1;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(-1, 1, -1, 1);
    __m256d B = _mm256_setr_pd(1, 1, -1, 1);
    __m256d C = _mm256_setr_pd(1, -1, 1, -1);
    assert(_mm256_testz_pd(A, A) == 0);
    assert(_mm256_testz_pd(A, B) == 0);
    assert(_mm256_testz_pd(C, A) == 1);
}

/// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 128-bit value, and return 1 if the sign
/// bit of each 32-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number in the same place.
int _mm_testz_ps (__m128 a, __m128 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzps(a, b);
    }
    else
    {
        int4 la = cast(int4)a;
        int4 lb = cast(int4)b;
        int4 r = la & lb;
        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
        return (~m >> 31) & 1;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(-1, 1, -1, 1);
    __m128 B = _mm_setr_ps( 1, 1, -1, 1);
    __m128 C = _mm_setr_ps( 1, -1, 1, -1);
    assert(_mm_testz_ps(A, A) == 0);
    assert(_mm_testz_ps(A, B) == 0);
    assert(_mm_testz_ps(C, A) == 1);
    assert(_mm_testz_ps(C, B) == 1);
}
/// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 256-bit value, and return 1 if the sign
/// bit of each 32-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number in the same place.
int _mm256_testz_ps (__m256 a, __m256 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzps256(a, b);
    }
    else
    {
        int8 la = cast(int8)a;
        int8 lb = cast(int8)b;
        int8 r = la & lb;
        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3]
              | r.array[4] | r.array[5] | r.array[6] | r.array[7];
        return (~m >> 31) & 1;
    }
}

/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`,
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm256_testz_si256 (__m256i a, __m256i b) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_ptestz256(cast(long4)a, cast(long4)b);
    }
    else version(LDC)
    {
        // better to split than do vanilla (down to 8 inst in arm64)
        __m128i lo_a = _mm256_extractf128_si256!0(a);
        __m128i lo_b = _mm256_extractf128_si256!0(b);
        __m128i hi_a = _mm256_extractf128_si256!1(a);
        __m128i hi_b = _mm256_extractf128_si256!1(b);
        return _mm_testz_si128(lo_a, lo_b) & _mm_testz_si128(hi_a, hi_b);
    }
    else
    {
        __m256i c = a & b;
        long[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m256i A  = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0x01, 0x02, 0x04, 0xf8);
    __m256i M1 = _mm256_setr_epi32(0xfe, 0xfd, 0x00, 0x07, 0xfe, 0xfd, 0x00, 0x07);
    __m256i M2 = _mm256_setr_epi32(0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00);
    assert(_mm256_testz_si256(A, A) == 0);
    assert(_mm256_testz_si256(A, M1) == 1);
    assert(_mm256_testz_si256(A, M2) == 0);
}

/// Return vector of type `__m256d` with undefined elements.
__m256d _mm256_undefined_pd () pure @safe
{
    __m256d r = void;
    return r;
}

/// Return vector of type `__m256` with undefined elements.
__m256 _mm256_undefined_ps () pure @safe
{
    __m256 r = void;
    return r;
}

/// Return vector of type `__m256i` with undefined elements.
__m256i _mm256_undefined_si256 () pure @safe
{
    __m256i r = void;
    return r;
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of
/// each 128-bit lane in `a` and `b`.
__m256d _mm256_unpackhi_pd (__m256d a, __m256d b) pure @trusted
{
    version(LDC)
    {
        return shufflevectorLDC!(double4, 1, 5, 3, 7)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpckhpd256 (a, b);
    }
    else
    {
        __m256d r;
        r.ptr[0] = a.array[1];
        r.ptr[1] = b.array[1];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    __m256d B = _mm256_setr_pd(5.0, 6, 7, 8);
    __m256d C = _mm256_unpackhi_pd(A, B);
    double[4] correct = [2.0, 6, 4, 8];
    assert(C.array == correct);
}
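// A small sketch of unpackhi/unpacklo used together: within each 128-bit lane,
// the pair fully interleaves two vectors (`_mm256_unpacklo_pd` is defined just
// below).
unittest
{
    __m256d x  = _mm256_setr_pd( 1.0,  2,  3,  4);
    __m256d y  = _mm256_setr_pd(10.0, 20, 30, 40);
    __m256d lo = _mm256_unpacklo_pd(x, y);
    __m256d hi = _mm256_unpackhi_pd(x, y);
    double[4] correctLo = [1.0, 10, 3, 30];
    double[4] correctHi = [2.0, 20, 4, 40];
    assert(lo.array == correctLo);
    assert(hi.array == correctHi);
}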
/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of
/// each 128-bit lane in `a` and `b`.
__m256 _mm256_unpackhi_ps (__m256 a, __m256 b) pure @trusted
{
    version(LDC)
    {
        return shufflevectorLDC!(float8, 2, 10, 3, 11, 6, 14, 7, 15)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpckhps256 (a, b);
    }
    else
    {
        __m256 r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = b.array[2];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        r.ptr[4] = a.array[6];
        r.ptr[5] = b.array[6];
        r.ptr[6] = a.array[7];
        r.ptr[7] = b.array[7];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    __m256 B = _mm256_setr_ps(8.0f, 9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_unpackhi_ps(A, B);
    float[8] correct = [2.0f, 10, 3, 11, 6, 14, 7, 15];
    assert(C.array == correct);
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of
/// each 128-bit lane in `a` and `b`.
__m256d _mm256_unpacklo_pd (__m256d a, __m256d b) pure @trusted
{
    version(LDC)
    {
        return shufflevectorLDC!(double4, 0, 4, 2, 6)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpcklpd256 (a, b);
    }
    else
    {
        __m256d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[2];
        r.ptr[3] = b.array[2];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    __m256d B = _mm256_setr_pd(5.0, 6, 7, 8);
    __m256d C = _mm256_unpacklo_pd(A, B);
    double[4] correct = [1.0, 5, 3, 7];
    assert(C.array == correct);
}

/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of
/// each 128-bit lane in `a` and `b`.
__m256 _mm256_unpacklo_ps (__m256 a, __m256 b) pure @trusted
{
    version(LDC)
    {
        return shufflevectorLDC!(float8, 0, 8, 1, 9, 4, 12, 5, 13)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpcklps256 (a, b);
    }
    else
    {
        __m256 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        r.ptr[4] = a.array[4];
        r.ptr[5] = b.array[4];
        r.ptr[6] = a.array[5];
        r.ptr[7] = b.array[5];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    __m256 B = _mm256_setr_ps(8.0f, 9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_unpacklo_ps(A, B);
    float[8] correct = [0.0f, 8, 1, 9, 4, 12, 5, 13];
    assert(C.array == correct);
}

/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_xor_pd (__m256d a, __m256d b) pure @safe
{
    return cast(__m256d)( cast(__m256i)a ^ cast(__m256i)b );
}

/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_xor_ps (__m256 a, __m256 b) pure @safe
{
    return cast(__m256)( cast(__m256i)a ^ cast(__m256i)b );
}
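// A sketch of the classic XOR trick: `_mm256_xor_ps` with a vector of sign-bit
// masks (-0.0f) negates every element in one instruction.
unittest
{
    __m256 signBits = _mm256_set1_ps(-0.0f);
    __m256 v = _mm256_setr_ps(1.0f, -2, 3, -4, 5, -6, 7, -8);
    __m256 negated = _mm256_xor_ps(v, signBits);
    float[8] correct = [-1.0f, 2, -3, 4, -5, 6, -7, 8];
    assert(negated.array == correct);
}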
/// Zero the contents of all XMM or YMM registers.
void _mm256_zeroall () pure @safe
{
    // PERF DMD needs to do it explicitly if AVX is ever used one day.

    static if (GDC_with_AVX)
    {
        __builtin_ia32_vzeroall();
    }
    else
    {
        // Do nothing. The transition penalties are supposed to be handled by the backend (e.g. LLVM).
    }
}

/// Zero the upper 128 bits of all YMM registers; the lower 128 bits are preserved.
void _mm256_zeroupper () pure @safe
{
    // PERF DMD needs to do it explicitly if AVX is ever used.

    static if (GDC_with_AVX)
    {
        __builtin_ia32_vzeroupper();
    }
    else
    {
        // Do nothing. The transition penalties are supposed to be handled by the backend (e.g. LLVM).
    }
}

/// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are zeroed.
__m256d _mm256_zextpd128_pd256 (__m128d a) pure @trusted
{
    __m256d r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[1];
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return r;
}
unittest
{
    __m256d R = _mm256_zextpd128_pd256(_mm_setr_pd(2.0, -3.0));
    double[4] correct = [2.0, -3, 0, 0];
    assert(R.array == correct);
}

/// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are zeroed.
__m256 _mm256_zextps128_ps256 (__m128 a) pure @trusted
{
    // Reinterpret as double2/double4 so the 128-bit copy is done in two 64-bit moves.
    double2 la = cast(double2)a;
    double4 r;
    r.ptr[0] = la.array[0];
    r.ptr[1] = la.array[1];
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return cast(__m256)r;
}
unittest
{
    __m256 R = _mm256_zextps128_ps256(_mm_setr_ps(2.0, -3.0, 4, -5));
    float[8] correct = [2.0, -3, 4, -5, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are zeroed.
__m256i _mm256_zextsi128_si256 (__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    __m256i r;
    r.ptr[0] = la.array[0];
    r.ptr[1] = la.array[1];
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return r;
}
unittest
{
    __m256i R = _mm256_zextsi128_si256(_mm_setr_epi64(-1, 99));
    long[4] correct = [-1, 99, 0, 0];
    assert(R.array == correct);
}
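// A closing sketch of why the zext casts matter: unlike the plain
// `_mm256_castps128_ps256` cast (whose upper lane is undefined), the zext
// variant guarantees zeros there, so the result is safe to use in
// full-width arithmetic.
unittest
{
    __m128 lowHalf = _mm_setr_ps(1.0f, 2, 3, 4);
    __m256 widened = _mm256_zextps128_ps256(lowHalf);
    __m256 doubled = _mm256_add_ps(widened, widened);
    float[8] correct = [2.0f, 4, 6, 8, 0, 0, 0, 0];
    assert(doubled.array == correct);
}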