/**
* AVX and FP16C intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
*
* Copyright: Guillaume Piolat 2022.
*            Johan Engelen 2022.
*            cet 2024.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.avxintrin;

// AVX instructions
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=AVX
// Note: this header will work whether you have AVX enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
// generate AVX instructions.
// With GDC, use "dflags-gdc": ["-mavx"] or equivalent to actively
// generate AVX instructions.

// This header also implements FP16C intrinsics.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=F16C
// With LDC, use "dflags-ldc": ["-mattr=+f16c"] or equivalent to actively
// generate F16C instructions.
// With GDC, use "dflags-gdc": ["-mf16c"] or equivalent to actively
// generate F16C instructions.

/// IMPORTANT NOTE ABOUT MASK LOAD/STORE:
///
/// In theory, masked load/store can address unaddressable memory provided the mask is zero.
/// In practice, that is not the case, for the following reasons:
///
/// - The AMD manual says:
///   "Exception and trap behavior for elements not selected for loading or storing from/to memory
///   is implementation dependent. For instance, a given implementation may signal a data
///   breakpoint or a page fault for doublewords that are zero-masked and not actually written."
///
/// - Intel fetches the whole cacheline anyway:
///   https://erik.science/2019/06/21/AVX-fun.html
///   "Even if the mask is stored in the special mask registers, it will still first fetch the data
///   before checking the mask."
///
/// So intel-intrinsics adopted the tightened semantics of only addressing fully addressable memory
/// with masked loads and stores.

/// Some AVX intrinsics take a float comparison constant.
/// When labelled "ordered" it means "AND ordered".
/// When labelled "unordered" it means "OR unordered".
alias _CMP_EQ = int;
///ditto
enum : _CMP_EQ
{
    _CMP_EQ_OQ    = 0x00, // Equal (ordered, non-signaling)
    _CMP_LT_OS    = 0x01, // Less-than (ordered, signaling)
    _CMP_LE_OS    = 0x02, // Less-than-or-equal (ordered, signaling)
    _CMP_UNORD_Q  = 0x03, // Unordered (non-signaling)
    _CMP_NEQ_UQ   = 0x04, // Not-equal (unordered, non-signaling)
    _CMP_NLT_US   = 0x05, // Not-less-than (unordered, signaling)
    _CMP_NLE_US   = 0x06, // Not-less-than-or-equal (unordered, signaling)
    _CMP_ORD_Q    = 0x07, // Ordered (non-signaling)
    _CMP_EQ_UQ    = 0x08, // Equal (unordered, non-signaling)
    _CMP_NGE_US   = 0x09, // Not-greater-than-or-equal (unordered, signaling)
    _CMP_NGT_US   = 0x0a, // Not-greater-than (unordered, signaling)
    _CMP_FALSE_OQ = 0x0b, // False (ordered, non-signaling)
    _CMP_NEQ_OQ   = 0x0c, // Not-equal (ordered, non-signaling)
    _CMP_GE_OS    = 0x0d, // Greater-than-or-equal (ordered, signaling)
    _CMP_GT_OS    = 0x0e, // Greater-than (ordered, signaling)
    _CMP_TRUE_UQ  = 0x0f, // True (unordered, non-signaling)
    _CMP_EQ_OS    = 0x10, // Equal (ordered, signaling)
    _CMP_LT_OQ    = 0x11, // Less-than (ordered, non-signaling)
    _CMP_LE_OQ    = 0x12, // Less-than-or-equal (ordered, non-signaling)
    _CMP_UNORD_S  = 0x13, // Unordered (signaling)
    _CMP_NEQ_US   = 0x14, // Not-equal (unordered, signaling)
    _CMP_NLT_UQ   = 0x15, // Not-less-than (unordered, non-signaling)
    _CMP_NLE_UQ   = 0x16, // Not-less-than-or-equal (unordered, non-signaling)
    _CMP_ORD_S    = 0x17, // Ordered (signaling)
    _CMP_EQ_US    = 0x18, // Equal (unordered, signaling)
    _CMP_NGE_UQ   = 0x19, // Not-greater-than-or-equal (unordered, non-signaling)
    _CMP_NGT_UQ   = 0x1a, // Not-greater-than (unordered, non-signaling)
    _CMP_FALSE_OS = 0x1b, // False (ordered, signaling)
    _CMP_NEQ_OS   = 0x1c, // Not-equal (ordered, signaling)
    _CMP_GE_OQ    = 0x1d, // Greater-than-or-equal (ordered, non-signaling)
    _CMP_GT_OQ    = 0x1e, // Greater-than (ordered, non-signaling)
    _CMP_TRUE_US  = 0x1f  // True (unordered, signaling)
}

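// The following is an illustrative example (not part of the original test suite)
// of how "ordered" and "unordered" predicates differ once NaN is involved:
// an ordered predicate is false for a NaN operand, an unordered one is true.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, 2.0);
    long2 O = cast(long2) _mm_cmp_pd!_CMP_EQ_OQ(A, B);
    long2 U = cast(long2) _mm_cmp_pd!_CMP_EQ_UQ(A, B);
    long[2] correctO = [-1, 0];  // NaN lane fails "equal AND ordered"
    long[2] correctU = [-1, -1]; // NaN lane passes "equal OR unordered"
    assert(O.array == correctO);
    assert(U.array == correctU);
}
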
public import inteli.types;
import inteli.internals;

// Pull in all previous instruction set intrinsics.
public import inteli.smmintrin;
public import inteli.tmmintrin;
public import inteli.nmmintrin;


// In x86, earlier LDC versions may have trouble preserving the stack pointer when an unsupported
// 256-bit vector type is passed and AVX is disabled.
// This leads to disabling some intrinsics in this particular situation, since they are not safe
// for the caller.
version(LDC)
{
    version(X86)
    {
        enum llvm256BitStackWorkaroundIn32BitX86 = __VERSION__ < 2099;
    }
    else
        enum llvm256BitStackWorkaroundIn32BitX86 = false;
}
else
    enum llvm256BitStackWorkaroundIn32BitX86 = false;


nothrow @nogc:

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_add_pd (__m256d a, __m256d b) pure @trusted
{
    return a + b;
}
unittest
{
    align(32) double[4] A = [-1, 2, -3, 40000];
    align(32) double[4] B = [ 9, -7, 8, -0.5];
    __m256d R = _mm256_add_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
    double[4] correct = [8, -5, 5, 39999.5];
    assert(R.array == correct);
}

/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_add_ps (__m256 a, __m256 b) pure @trusted
{
    return a + b;
}
unittest
{
    align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6];
    align(32) float[8] B = [ 9.0f, -7, 8, -0.5, 8, 7, 3, -1];
    __m256 R = _mm256_add_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
    float[8] correct = [8, -5, 5, 39999.5, 8, 10, 8, 5];
    assert(R.array == correct);
}

/// Alternatively add and subtract packed double-precision (64-bit) floating-point
/// elements in `a` to/from packed elements in `b`.
__m256d _mm256_addsub_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_addsubpd256(a, b);
    }
    else
    {
        // Note: GDC x86 generates addsubpd since GDC 11.1 with -O3
        //       LDC x86 generates addsubpd since LDC 1.18 with -O2
        //       LDC ARM: not fantastic, ok since LDC 1.18 -O2
        a.ptr[0] = a.array[0] + (-b.array[0]);
        a.ptr[1] = a.array[1] + b.array[1];
        a.ptr[2] = a.array[2] + (-b.array[2]);
        a.ptr[3] = a.array[3] + b.array[3];
        return a;
    }
}
unittest
{
    align(32) double[4] A = [-1, 2, -3, 40000];
    align(32) double[4] B = [ 9, -7, 8, -0.5];
    __m256d R = _mm256_addsub_pd(_mm256_load_pd(A.ptr), _mm256_load_pd(B.ptr));
    double[4] correct = [-10, -5, -11, 39999.5];
    assert(R.array == correct);
}

/// Alternatively add and subtract packed single-precision (32-bit) floating-point elements
/// in `a` to/from packed elements in `b`.
__m256 _mm256_addsub_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_addsubps256(a, b);
    }
    else
    {
        // Note: GDC x86 generates addsubps since GDC 11 -O3,
        //       and in absence of AVX, a pair of SSE3 addsubps since GDC 12 -O2.
        //       LDC x86 generates addsubps since LDC 1.18 -O2,
        //       and in absence of AVX, a pair of SSE3 addsubps since LDC 1.1 -O1.
        //       LDC ARM: neat output since LDC 1.21 -O2.
        a.ptr[0] = a.array[0] + (-b.array[0]);
        a.ptr[1] = a.array[1] + b.array[1];
        a.ptr[2] = a.array[2] + (-b.array[2]);
        a.ptr[3] = a.array[3] + b.array[3];
        a.ptr[4] = a.array[4] + (-b.array[4]);
        a.ptr[5] = a.array[5] + b.array[5];
        a.ptr[6] = a.array[6] + (-b.array[6]);
        a.ptr[7] = a.array[7] + b.array[7];
        return a;
    }
}
unittest
{
    align(32) float[8] A = [-1.0f, 2, -3, 40000, 0, 3, 5, 6];
    align(32) float[8] B = [ 9.0f, -7, 8, -0.5, 8, 7, 3, -1];
    __m256 R = _mm256_addsub_ps(_mm256_load_ps(A.ptr), _mm256_load_ps(B.ptr));
    float[8] correct = [ -10, -5, -11, 39999.5, -8, 10, 2, 5];
    assert(R.array == correct);
}

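// The sketch below is illustrative (an assumption about typical usage, not from
// the original docs): addsub maps naturally onto interleaved complex
// multiplication, where real lanes need a subtraction and imaginary lanes an
// addition. Shown with the 128-bit SSE3 variant for brevity.
unittest
{
    // (a + ib)(c + id) = (ac - bd) + i(ad + bc), lanes laid out as [re, im, re, im]
    __m128 x = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); // 1+2i, 3+4i
    __m128 y = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); // 5+6i, 7+8i
    __m128 yRe   = _mm_moveldup_ps(y);                             // [5 5 7 7]
    __m128 yIm   = _mm_movehdup_ps(y);                             // [6 6 8 8]
    __m128 xSwap = _mm_shuffle_ps!(_MM_SHUFFLE(2, 3, 0, 1))(x, x); // [2 1 4 3]
    __m128 r = _mm_addsub_ps(x * yRe, xSwap * yIm);
    float[4] correct = [-7.0f, 16.0f, -11.0f, 52.0f]; // -7+16i, -11+52i
    assert(r.array == correct);
}
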
/// Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_and_pd (__m256d a, __m256d b) pure @trusted
{
    // Note: GCC avxintrin.h uses the builtins for AND NOTAND OR of _ps and _pd,
    // but those do not seem needed at any optimization level.
    return cast(__m256d)(cast(__m256i)a & cast(__m256i)b);
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m256d A = _mm256_set_pd(a, b, a, b);
    __m256d B = _mm256_set_pd(b, a, b, a);
    long4 R = cast(long4)( _mm256_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
    assert(R.array[2] == correct);
    assert(R.array[3] == correct);
}

/// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_and_ps (__m256 a, __m256 b) pure @trusted
{
    return cast(__m256)(cast(__m256i)a & cast(__m256i)b);
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
    __m256 A = _mm256_set_ps(a, b, a, b, a, b, a, b);
    __m256 B = _mm256_set_ps(b, a, b, a, b, a, b, a);
    int8 R = cast(int8)( _mm256_and_ps(A, B) );
    foreach(i; 0..8)
        assert(R.array[i] == correct);
}

/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`
/// and then AND with `b`.
__m256d _mm256_andnot_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    __m256i notA = _mm256_not_si256(cast(__m256i)a);
    __m256i ib = cast(__m256i)b;
    __m256i ab = notA & ib;
    return cast(__m256d)ab;
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long notA = ~ ( *cast(long*)(&a) );
    long correct = notA & (*cast(long*)(&b));
    __m256d A = _mm256_set_pd(a, a, a, a);
    __m256d B = _mm256_set_pd(b, b, b, b);
    long4 R = cast(long4)( _mm256_andnot_pd(A, B) );
    foreach(i; 0..4)
        assert(R.array[i] == correct);
}

/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a`
/// and then AND with `b`.
__m256 _mm256_andnot_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    __m256i notA = _mm256_not_si256(cast(__m256i)a);
    __m256i ib = cast(__m256i)b;
    __m256i ab = notA & ib;
    return cast(__m256)ab;
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int notA = ~ ( *cast(int*)(&a) );
    int correct = notA & (*cast(int*)(&b));
    __m256 A = _mm256_set1_ps(a);
    __m256 B = _mm256_set1_ps(b);
    int8 R = cast(int8)( _mm256_andnot_ps(A, B) );
    foreach(i; 0..8)
        assert(R.array[i] == correct);
}

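// Illustrative sketch (a classic bit-trick, not from the original docs):
// ANDing with a mask that clears the sign bit computes |x| in every lane.
unittest
{
    __m256d x = _mm256_setr_pd(-1.5, 2.0, -0.0, 8.5);
    __m256d absMask = cast(__m256d) _mm256_set1_epi64x(0x7fff_ffff_ffff_ffff);
    __m256d r = _mm256_and_pd(x, absMask);
    double[4] correct = [1.5, 2.0, 0.0, 8.5];
    assert(r.array == correct);
}
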
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control
/// mask `imm8`.
__m256d _mm256_blend_pd(int imm8)(__m256d a, __m256d b)
{
    static assert(imm8 >= 0 && imm8 < 16);

    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_blendpd256 (a, b, imm8);
    }
    else
    {
        // Works great with LDC.
        double4 r;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(0, 1, 2, 3);
    __m256d B = _mm256_setr_pd(8, 9, 10, 11);
    double4 C = _mm256_blend_pd!0x06(A, B);
    double[4] correct = [0, 9, 10, 3];
    assert(C.array == correct);
}

/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control
/// mask `imm8`.
__m256 _mm256_blend_ps(int imm8)(__m256 a, __m256 b) pure @trusted
{
    static assert(imm8 >= 0 && imm8 < 256);
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_blendps256 (a, b, imm8);
    }
    else version(LDC)
    {
        // LDC x86: generates a vblendps since LDC 1.1 -O0
        //   arm64: pretty good, four instructions worst case
        return shufflevectorLDC!(float8, (imm8 & 1)   ? 8  : 0,
                                         (imm8 & 2)   ? 9  : 1,
                                         (imm8 & 4)   ? 10 : 2,
                                         (imm8 & 8)   ? 11 : 3,
                                         (imm8 & 16)  ? 12 : 4,
                                         (imm8 & 32)  ? 13 : 5,
                                         (imm8 & 64)  ? 14 : 6,
                                         (imm8 & 128) ? 15 : 7)(a, b);
    }
    else
    {
        // LDC x86: vblendps generated since LDC 1.27 -O1
        float8 r;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
    __m256 B = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15);
    float8 C = _mm256_blend_ps!0xe7(A, B);
    float[8] correct = [8, 9, 10, 3, 4, 13, 14, 15];
    assert(C.array == correct);
}

/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        // Amazingly enough, GCC/GDC generates the vblendvpd instruction
        // with -mavx2 but not -mavx.
        // Not sure what the reason is, and there is a replacement sequence.
        // Sounds like a bug, similar to _mm_blendv_pd,
        // or maybe the instruction is unsafe?
        return __builtin_ia32_blendvpd256(a, b, mask);
    }
    else static if (LDC_with_AVX)
    {
        return __builtin_ia32_blendvpd256(a, b, mask);
    }
    else
    {
        // LDC x86: vblendvpd since LDC 1.27 -O2
        //   arm64: only 4 instructions, since LDC 1.27 -O2
        __m256d r;
        long4 lmask = cast(long4)mask;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    __m256d B = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
    __m256d M = _mm256_setr_pd(-3.0, 2.0, 1.0, -4.0);
    __m256d R = _mm256_blendv_pd(A, B, M);
    double[4] correct1 = [5.0, 2.0, 3.0, 8.0];
    assert(R.array == correct1);
}

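// Illustrative sketch (assumed typical usage, not from the original tests):
// a comparison yields the all-ones / all-zeroes lanes that blendv consumes,
// here to clamp negative lanes to zero.
unittest
{
    __m256d x = _mm256_setr_pd(-3.0, 1.0, -0.5, 2.0);
    __m256d mask = _mm256_cmp_pd!_CMP_LT_OS(x, _mm256_setzero_pd());
    __m256d r = _mm256_blendv_pd(x, _mm256_setzero_pd(), mask);
    double[4] correct = [0.0, 1.0, 0.0, 2.0];
    assert(r.array == correct);
}
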
/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b`
/// using `mask`.
__m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_blendvps256(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        int8 shift;
        shift = 31;
        int8 lmask = cast(int8)mask >> shift;
        int8 ia = cast(int8)a;
        int8 ib = cast(int8)b;
        return cast(__m256)(ia ^ ((ia ^ ib) & lmask));
    }
    else
    {
        // In both LDC and GDC with SSE4.1, this generates blendvps as fallback
        __m256 r;
        int8 lmask = cast(int8)mask;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
    __m256 B = _mm256_setr_ps(5.0f, 6.0f, 7.0f, 8.0f, 5.0f, 6.0f, 7.0f, 8.0f);
    __m256 M = _mm256_setr_ps(-3.0f, 2.0f, 1.0f, -4.0f, -3.0f, 2.0f, 1.0f, -4.0f);
    __m256 R = _mm256_blendv_ps(A, B, M);
    float[8] correct1 = [5.0f, 2.0f, 3.0f, 8.0f, 5.0f, 2.0f, 3.0f, 8.0f];
    assert(R.array == correct1);
}

/// Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit)
/// floating-point elements) to all elements.
/// This effectively duplicates the 128-bit vector.
__m256d _mm256_broadcast_pd (const(__m128d)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastf128_pd256(cast(float4*)mem_addr);
    }
    else
    {
        const(double)* p = cast(const(double)*) mem_addr;
        __m256d r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[0];
        r.ptr[3] = p[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(3, -4);
    __m256d B = _mm256_broadcast_pd(&A);
    double[4] correct = [3, -4, 3, -4];
    assert(B.array == correct);
}

/// Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit)
/// floating-point elements) to all elements.
/// This effectively duplicates the 128-bit vector.
__m256 _mm256_broadcast_ps (const(__m128)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastf128_ps256(cast(float4*)mem_addr);
    }
    else
    {
        const(float)* p = cast(const(float)*)mem_addr;
        __m256 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        r.ptr[4] = p[0];
        r.ptr[5] = p[1];
        r.ptr[6] = p[2];
        r.ptr[7] = p[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, -4);
    __m256 B = _mm256_broadcast_ps(&A);
    float[8] correct = [1.0f, 2, 3, -4, 1, 2, 3, -4];
    assert(B.array == correct);
}

/// Broadcast a double-precision (64-bit) floating-point element from memory to all elements.
__m256d _mm256_broadcast_sd (const(double)* mem_addr) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastsd256(mem_addr);
    }
    else
    {
        double a = *mem_addr;
        __m256d r;
        r.ptr[0] = a;
        r.ptr[1] = a;
        r.ptr[2] = a;
        r.ptr[3] = a;
        return r;
    }
}
unittest
{
    double t = 7.5;
    __m256d A = _mm256_broadcast_sd(&t);
    double[4] correct = [7.5, 7.5, 7.5, 7.5];
    assert(A.array == correct);
}

/// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
__m128 _mm_broadcast_ss (const(float)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastss(mem_addr);
    }
    else
    {
        float a = *mem_addr;
        __m128 r;
        r.ptr[0] = a;
        r.ptr[1] = a;
        r.ptr[2] = a;
        r.ptr[3] = a;
        return r;
    }
}
unittest
{
    float t = 7.5f;
    __m128 A = _mm_broadcast_ss(&t);
    float[4] correct = [7.5f, 7.5f, 7.5f, 7.5f];
    assert(A.array == correct);
}

/// Broadcast a single-precision (32-bit) floating-point element from memory to all elements.
__m256 _mm256_broadcast_ss (const(float)* mem_addr)
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vbroadcastss256 (mem_addr);
    }
    else
    {
        float a = *mem_addr;
        __m256 r = __m256(a);
        return r;
    }
}
unittest
{
    float t = 7.5f;
    __m256 A = _mm256_broadcast_ss(&t);
    float[8] correct = [7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f];
    assert(A.array == correct);
}

/// Cast vector of type `__m256d` to type `__m256`.
__m256 _mm256_castpd_ps (__m256d a) pure @safe
{
    return cast(__m256)a;
}

/// Cast vector of type `__m256d` to type `__m256i`.
__m256i _mm256_castpd_si256 (__m256d a) pure @safe
{
    return cast(__m256i)a;
}

/// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are undefined.
__m256d _mm256_castpd128_pd256 (__m128d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_pd256_pd(a);
    }
    else
    {
        __m256d r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, -6.125);
    __m256d B = _mm256_castpd128_pd256(A);
    assert(B.array[0] == 4.0);
    assert(B.array[1] == -6.125);
}

/// Cast vector of type `__m256d` to type `__m128d`; the upper 128 bits of `a` are lost.
__m128d _mm256_castpd256_pd128 (__m256d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_pd_pd256(a);
    }
    else
    {
        __m128d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_set_pd(1, 2, -6.25, 4.0);
    __m128d B = _mm256_castpd256_pd128(A);
    assert(B.array[0] == 4.0);
    assert(B.array[1] == -6.25);
}

/// Cast vector of type `__m256` to type `__m256d`.
__m256d _mm256_castps_pd (__m256 a) pure @safe
{
    return cast(__m256d)a;
}

/// Cast vector of type `__m256` to type `__m256i`.
__m256i _mm256_castps_si256 (__m256 a) pure @safe
{
    return cast(__m256i)a;
}

/// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are undefined.
__m256 _mm256_castps128_ps256 (__m128 a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_ps256_ps(a);
    }
    else
    {
        __m256 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
    __m256 B = _mm256_castps128_ps256(A);
    float[4] correct = [1.0f, 2, 3, 4];
    assert(B.array[0..4] == correct);
}

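// Illustrative sketch (an assumption about intended usage): since the upper
// lane of _mm256_castps128_ps256 is undefined, build the vector explicitly
// when defined upper bits are required.
unittest
{
    __m128 lo = _mm_setr_ps(1.0f, 2, 3, 4);
    __m256 zeroExtended = _mm256_set_m128(_mm_setzero_ps(), lo);
    float[8] correct = [1.0f, 2, 3, 4, 0, 0, 0, 0];
    assert(zeroExtended.array == correct);
}
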
/// Cast vector of type `__m256` to type `__m128`. The upper 128 bits of `a` are lost.
__m128 _mm256_castps256_ps128 (__m256 a) pure @trusted
{
    return *cast(const(__m128)*)(&a);
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    __m128 B = _mm256_castps256_ps128(A);
    float[4] correct = [1.0f, 2, 3, 4];
    assert(B.array == correct);
}

/// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are undefined.
__m256i _mm256_castsi128_si256 (__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    long4 r = void;
    r.ptr[0] = la.array[0];
    r.ptr[1] = la.array[1];
    return r;
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 42);
    __m256i B = _mm256_castsi128_si256(A);
    long[2] correct = [-1, 42];
    assert(B.array[0..2] == correct);
}

/// Cast vector of type `__m256i` to type `__m256d`.
__m256d _mm256_castsi256_pd (__m256i a) pure @safe
{
    return cast(__m256d)a;
}

/// Cast vector of type `__m256i` to type `__m256`.
__m256 _mm256_castsi256_ps (__m256i a) pure @safe
{
    return cast(__m256)a;
}

/// Cast vector of type `__m256i` to type `__m128i`. The upper 128 bits of `a` are lost.
__m128i _mm256_castsi256_si128 (__m256i a) pure @trusted
{
    long2 r = void;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[1];
    return cast(__m128i)r;
}
unittest
{
    long4 A;
    A.ptr[0] = -1;
    A.ptr[1] = 42;
    long2 B = cast(long2)(_mm256_castsi256_si128(A));
    long[2] correct = [-1, 42];
    assert(B.array[0..2] == correct);
}

/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer
/// value, and store the results as packed double-precision floating-point elements.
__m256d _mm256_ceil_pd (__m256d a) @safe
{
    static if (LDC_with_ARM64)
    {
        __m128d lo = _mm256_extractf128_pd!0(a);
        __m128d hi = _mm256_extractf128_pd!1(a);
        __m128d ilo = _mm_ceil_pd(lo);
        __m128d ihi = _mm_ceil_pd(hi);
        return _mm256_set_m128d(ihi, ilo);
    }
    else
    {
        return _mm256_round_pd!2(a);
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.3f, -2.12f, 53.6f, -2.7f);
    A = _mm256_ceil_pd(A);
    double[4] correct = [2.0, -2.0, 54.0, -2.0];
    assert(A.array == correct);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer
/// value, and store the results as packed single-precision floating-point elements.
__m256 _mm256_ceil_ps (__m256 a) @safe
{
    static if (LDC_with_ARM64)
    {
        __m128 lo = _mm256_extractf128_ps!0(a);
        __m128 hi = _mm256_extractf128_ps!1(a);
        __m128 ilo = _mm_ceil_ps(lo);
        __m128 ihi = _mm_ceil_ps(hi);
        return _mm256_set_m128(ihi, ilo);
    }
    else
    {
        return _mm256_round_ps!2(a);
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f);
    __m256 C = _mm256_ceil_ps(A);
    float[8] correct = [2.0f, -2.0f, 54.0f, -2.0f, -1, 3, -53, 3];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b` based on the
/// comparison operand specified by `imm8`.
__m128d _mm_cmp_pd(int imm8)(__m128d a, __m128d b) pure @safe
{
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128d) cmppd!comparison(a, b);
}
unittest
{
    __m128d A = _mm_setr_pd(double.infinity, double.nan);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    long2 R = cast(long2) _mm_cmp_pd!_CMP_GT_OS(A, B);
    long[2] correct = [-1, 0];
    assert(R.array == correct);

    long2 R2 = cast(long2) _mm_cmp_pd!_CMP_NLE_UQ(A, B);
    long[2] correct2 = [-1, -1];
    assert(R2.array == correct2);
}

///ditto
__m256d _mm256_cmp_pd(int imm8)(__m256d a, __m256d b) pure @safe
{
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m256d) cmppd256!comparison(a, b);
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2.0, 3.0, double.nan);
    __m256d B = _mm256_setr_pd(3.0, 2.0, 1.0, double.nan);
    __m256i R = cast(__m256i) _mm256_cmp_pd!_CMP_LT_OS(A, B);
    long[4] correct = [-1, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` based on the
/// comparison operand specified by `imm8`.
__m128 _mm_cmp_ps(int imm8)(__m128 a, __m128 b) pure @safe
{
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128) cmpps!comparison(a, b);
}

///ditto
__m256 _mm256_cmp_ps(int imm8)(__m256 a, __m256 b) pure @safe
{
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m256) cmpps256!comparison(a, b);
}

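// Illustrative example (the two _ps comparison intrinsics above ship without a
// test in this section): an ordered greater-than-or-equal predicate, where a
// NaN lane compares false.
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2.0f, float.nan, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
    __m256 B = _mm256_setr_ps(2.0f, 2.0f, 2.0f,      2.0f, 9.0f, 6.0f, 2.0f, 2.0f);
    int8 R = cast(int8) _mm256_cmp_ps!_CMP_GE_OQ(A, B);
    int[8] correct = [0, -1, 0, -1, 0, -1, -1, -1];
    assert(R.array == correct);
}
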
/// Compare the lower double-precision (64-bit) floating-point element in `a` and `b` based on the
/// comparison operand specified by `imm8`, store the result in the lower element of result, and
/// copy the upper element from `a` to the upper element of result.
__m128d _mm_cmp_sd(int imm8)(__m128d a, __m128d b) pure @safe
{
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128d) cmpsd!comparison(a, b);
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` based on the
/// comparison operand specified by `imm8`, store the result in the lower element of result, and
/// copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmp_ss(int imm8)(__m128 a, __m128 b) pure @safe
{
    enum comparison = mapAVXFPComparison(imm8);
    return cast(__m128) cmpss!comparison(a, b);
}

/// Convert packed signed 32-bit integers in `a` to packed double-precision (64-bit) floating-point
/// elements.
__m256d _mm256_cvtepi32_pd (__m128i a) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x double>
            ret <4 x double> %r`;
        return LDCInlineIR!(ir, double4, __m128i)(a);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_cvtdq2pd256(a);
    }
    else
    {
        double4 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m256d R = _mm256_cvtepi32_pd(_mm_set1_epi32(54));
    double[4] correct = [54.0, 54, 54, 54];
    assert(R.array == correct);
}

/// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point
/// elements.
__m256 _mm256_cvtepi32_ps (__m256i a) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        enum ir = `
            %r = sitofp <8 x i32> %0 to <8 x float>
            ret <8 x float> %r`;
        return LDCInlineIR!(ir, float8, int8)(cast(int8)a);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_cvtdq2ps256(cast(int8)a);
    }
    else
    {
        int8 ia = cast(int8)a;
        __m256 r;
        r.ptr[0] = ia.array[0];
        r.ptr[1] = ia.array[1];
        r.ptr[2] = ia.array[2];
        r.ptr[3] = ia.array[3];
        r.ptr[4] = ia.array[4];
        r.ptr[5] = ia.array[5];
        r.ptr[6] = ia.array[6];
        r.ptr[7] = ia.array[7];
        return r;
    }
}
unittest
{
    __m256 R = _mm256_cvtepi32_ps(_mm256_set1_epi32(5));
    float[8] correct = [5.0f, 5, 5, 5, 5, 5, 5, 5];
    assert(R.array == correct);
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit
/// integers. Follows the current rounding mode.
__m128i _mm256_cvtpd_epi32 (__m256d a) @safe
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_cvtpd2dq256(a);
    }
    else
    {
        __m128d lo = _mm256_extractf128_pd!0(a);
        __m128d hi = _mm256_extractf128_pd!1(a);
        __m128i ilo = _mm_cvtpd_epi32(lo); // Only the lower 64 bits contain significant values
        __m128i ihi = _mm_cvtpd_epi32(hi);
        return _mm_unpacklo_epi64(ilo, ihi);
    }
}
unittest
{
    int4 A = _mm256_cvtpd_epi32(_mm256_setr_pd(61.0, 55.0, -100, 1_000_000));
    int[4] correct = [61, 55, -100, 1_000_000];
    assert(A.array == correct);
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements.
__m128 _mm256_cvtpd_ps (__m256d a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_cvtpd2ps256(a);
    }
    else
    {
        __m128 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 5);
    __m128 R = _mm256_cvtpd_ps(A);
    float[4] correct = [1.0f, 2, 3, 5];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit
/// integers, using the current rounding mode.
__m256i _mm256_cvtps_epi32 (__m256 a) @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_cvtps2dq256(a);
    }
    else
    {
        __m128 lo = _mm256_extractf128_ps!0(a);
        __m128 hi = _mm256_extractf128_ps!1(a);
        __m128i ilo = _mm_cvtps_epi32(lo);
        __m128i ihi = _mm_cvtps_epi32(hi);
        return _mm256_set_m128i(ihi, ilo);
    }
}
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m256i A = _mm256_cvtps_epi32(_mm256_setr_ps(1.4f, -2.1f, 53.5f, -2.9f, -1.4f, 2.1f, -53.5f, 2.9f));
    assert( (cast(int8)A).array == [1, -2, 54, -3, -1, 2, -54, 3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm256_cvtps_epi32(_mm256_setr_ps(1.3f, -2.11f, 53.4f, -2.8f, -1.3f, 2.11f, -53.4f, 2.8f));
    assert( (cast(int8)A).array == [1, -3, 53, -3, -2, 2, -54, 2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm256_cvtps_epi32(_mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f));
    assert( (cast(int8)A).array == [2, -2, 54, -2, -1, 3, -53, 3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm256_cvtps_epi32(_mm256_setr_ps(1.4f, -2.17f, 53.8f, -2.91f, -1.4f, 2.17f, -53.8f, 2.91f));
    assert( (cast(int8)A).array == [1, -2, 53, -2, -1, 2, -53, 2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}


/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
/// double-precision (64-bit) floating-point elements.
__m256d _mm256_cvtps_pd (__m128 a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_cvtps2pd256(a); // LDC doesn't have the builtin
    }
    else
    {
        // LDC: x86, needs -O2 to generate cvtps2pd since LDC 1.2.0
        __m256d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = a.array[2];
        r.ptr[3] = a.array[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 5);
    __m256d R = _mm256_cvtps_pd(A);
    double[4] correct = [1.0, 2, 3, 5];
    assert(R.array == correct);
}

/// Return the lower double-precision (64-bit) floating-point element of `a`.
double _mm256_cvtsd_f64 (__m256d a) pure @safe
{
    return a.array[0];
}

/// Return the lower 32-bit integer in `a`.
int _mm256_cvtsi256_si32 (__m256i a) pure @safe
{
    return (cast(int8)a).array[0];
}

/// Return the lower single-precision (32-bit) floating-point element of `a`.
float _mm256_cvtss_f32 (__m256 a) pure @safe
{
    return a.array[0];
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit
/// integers with truncation.
__m128i _mm256_cvttpd_epi32 (__m256d a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m128i)__builtin_ia32_cvttpd2dq256(a);
    }
    else
    {
        __m128i r;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = cast(int)a.array[2];
        r.ptr[3] = cast(int)a.array[3];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_set_pd(4.7, -1000.9, -7.1, 3.1);
    __m128i R = _mm256_cvttpd_epi32(A);
    int[4] correct = [3, -7, -1000, 4];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit
/// integers with truncation.
__m256i _mm256_cvttps_epi32 (__m256 a) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256i)__builtin_ia32_cvttps2dq256(a);
    }
    else
    {
        int8 r;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = cast(int)a.array[2];
        r.ptr[3] = cast(int)a.array[3];
        r.ptr[4] = cast(int)a.array[4];
        r.ptr[5] = cast(int)a.array[5];
        r.ptr[6] = cast(int)a.array[6];
        r.ptr[7] = cast(int)a.array[7];
        return cast(__m256i)r;
    }
}
unittest
{
    __m256 A = _mm256_set_ps(4.7, -1000.9, -7.1, 3.1, 1.4, 2.9, -2.9, 0);
    int8 R = cast(int8) _mm256_cvttps_epi32(A);
    int[8] correct = [0, -2, 2, 1, 3, -7, -1000, 4];
    assert(R.array == correct);
}

/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m256d _mm256_div_pd (__m256d a, __m256d b) pure @safe
{
    return a / b;
}
unittest
{
    __m256d a = [1.5, -2.0, 3.0, 1.0];
    a = _mm256_div_pd(a, a);
    double[4] correct = [1.0, 1.0, 1.0, 1.0];
    assert(a.array == correct);
}

/// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
__m256 _mm256_div_ps (__m256 a, __m256 b) pure @safe
{
    return a / b;
}
unittest
{
    __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 4.5f, -5.0f, 6.0f, 7.0f];
    a = _mm256_div_ps(a, a);
    float[8] correct = [1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}

/// Conditionally multiply the packed single-precision (32-bit) floating-point elements in `a` and
/// `b` using the high 4 bits in `imm8`, sum the four products, and conditionally store the sum
/// using the low 4 bits of `imm8`.
__m256 _mm256_dp_ps(int imm8)(__m256 a, __m256 b)
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_dpps256(a, b, cast(ubyte)imm8);
    }
    else
    {
        // Note: in LDC with SSE4.1 but no AVX, we _could_ increase perf a bit by using
        // two _mm_dp_ps.
        __m256 zero = _mm256_setzero_ps();
        enum ubyte op = (imm8 >>> 4) & 15;
        __m256 temp = _mm256_blend_ps!( op | (op << 4) )(zero, a * b);
        float lo = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
        float hi = temp.array[4] + temp.array[5] + temp.array[6] + temp.array[7];
        __m256 r = _mm256_set_m128(_mm_set1_ps(hi), _mm_set1_ps(lo));
        enum ubyte op2 = (imm8 & 15);
        return _mm256_blend_ps!(op2 | (op2 << 4))(zero, r);
    }
}
unittest
{
    // Products:                 9     14    20    24    6     16    12   -24
    __m256 A = _mm256_setr_ps(1.0f, 2.0f, 4.0f, 8.0f, 1.0f, 2.0f, 4.0f, 8.0f);
    __m256 B = _mm256_setr_ps(9.0f, 7.0f, 5.0f, 3.0f, 6.0f, 8.0f, 3.0f,-3.0f);
    float8 R1 = _mm256_dp_ps!(0xf0 + 0xf)(A, B);
    float8 R2 = _mm256_dp_ps!(0x30 + 0x5)(A, B);
    float8 R3 = _mm256_dp_ps!(0x50 + 0xa)(A, B);
    float[8] correct1 = [67.0f, 67.0f, 67.0f, 67.0f, 10, 10, 10, 10];
    float[8] correct2 = [23.0f,  0.0f, 23.0f,  0.0f, 22,  0, 22,  0];
    float[8] correct3 = [ 0.0f, 29.0f,  0.0f, 29.0f,  0, 18,  0, 18];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}

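// Illustrative sketch (assumed usage, not from the original tests): a single
// _mm256_dp_ps evaluates two independent 4-component dot products, one per
// 128-bit lane; imm8 = 0xF1 multiplies all elements and stores each lane's sum
// in element 0 of that lane.
unittest
{
    __m256 xy = _mm256_setr_ps(1.0f, 0, 0, 0,   0.0f, 1, 0, 0); // two basis vectors
    __m256 n  = _mm256_setr_ps(0.5f, 2, 4, 8,   0.5f, 2, 4, 8);
    __m256 d = _mm256_dp_ps!0xF1(xy, n);
    assert(d.array[0] == 0.5f); // dot product in the low lane
    assert(d.array[4] == 2.0f); // dot product in the high lane
}
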
/// Extract a 32-bit integer from `a`, selected with `imm8`.
int _mm256_extract_epi32 (__m256i a, const int imm8) pure @trusted
{
    return (cast(int8)a).array[imm8 & 7];
}
unittest
{
    align(16) int[8] data = [-1, 2, -3, 4, 9, -7, 8, -6];
    auto A = _mm256_loadu_si256(cast(__m256i*) data.ptr);
    assert(_mm256_extract_epi32(A, 0) == -1);
    assert(_mm256_extract_epi32(A, 1 + 8) == 2);
    assert(_mm256_extract_epi32(A, 3 + 16) == 4);
    assert(_mm256_extract_epi32(A, 7 + 32) == -6);
}

/// Extract a 64-bit integer from `a`, selected with `index`.
long _mm256_extract_epi64 (__m256i a, const int index) pure @safe
{
    return a.array[index & 3];
}
unittest
{
    __m256i A = _mm256_setr_epi64x(-7, 6, 42, 0);
    assert(_mm256_extract_epi64(A, -8) == -7);
    assert(_mm256_extract_epi64(A, 1) == 6);
    assert(_mm256_extract_epi64(A, 2 + 4) == 42);
}

/// Extract a 128-bit lane from `a`, selected with `imm8` (0 or 1).
/// Note: `_mm256_extractf128_pd!0` is equivalent to `_mm256_castpd256_pd128`.
__m128d _mm256_extractf128_pd(ubyte imm8)(__m256d a) pure @trusted
{
    version(GNU) pragma(inline, true); // else GDC has trouble inlining this

    // PERF DMD
    static if (GDC_with_AVX)
    {
        // Note: needs to be a template intrinsic because of this builtin.
        return __builtin_ia32_vextractf128_pd256(a, imm8 & 1);
    }
    else
    {
        double2 r = void;
        enum int index = 2*(imm8 & 1);
        r.ptr[0] = a.array[index+0];
        r.ptr[1] = a.array[index+1];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    double[4] correct = [1.0, 2, 3, 4];
    __m128d l0 = _mm256_extractf128_pd!18(A);
    __m128d l1 = _mm256_extractf128_pd!55(A);
    assert(l0.array == correct[0..2]);
    assert(l1.array == correct[2..4]);
}

///ditto
__m128 _mm256_extractf128_ps(ubyte imm8)(__m256 a) pure @trusted
{
    version(GNU) pragma(inline, true); // else GDC has trouble inlining this

    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vextractf128_ps256(a, imm8 & 1);
    }
    else
    {
        float4 r = void; // Optimizes well since LDC 1.1 -O1
        enum int index = 4*(imm8 & 1);
        r.ptr[0] = a.array[index+0];
        r.ptr[1] = a.array[index+1];
        r.ptr[2] = a.array[index+2];
        r.ptr[3] = a.array[index+3];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0, 2, 3, 4, 5, 6, 7, 8);
    float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8];
    __m128 l0 = _mm256_extractf128_ps!8(A);
    __m128 l1 = _mm256_extractf128_ps!255(A);
    assert(l0.array == correct[0..4]);
    assert(l1.array == correct[4..8]);
}

///ditto
__m128i _mm256_extractf128_si256(ubyte imm8)(__m256i a) pure @trusted
{
    version(GNU) pragma(inline, true); // else GDC has trouble inlining this

    // PERF DMD
    static if (GDC_with_AVX)
    {
        // Note: if it weren't for this GDC intrinsic, _mm256_extractf128_si256
        // could be a non-template; however, this wins in -O0.
        // Same story for _mm256_extractf128_ps and _mm256_extractf128_pd.
        return __builtin_ia32_vextractf128_si256(cast(int8)a, imm8 & 1);
    }
    else
    {
        long2 r = void;
        enum int index = 2*(imm8 & 1);
        r.ptr[0] = a.array[index+0];
        r.ptr[1] = a.array[index+1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(9, 2, 3, 4, 5, 6, 7, 8);
    int[8] correct = [9, 2, 3, 4, 5, 6, 7, 8];
    __m128i l0 = _mm256_extractf128_si256!0(A);
    __m128i l1 = _mm256_extractf128_si256!1(A);
    assert(l0.array == correct[0..4]);
    assert(l1.array == correct[4..8]);
}

/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an integer
/// value, and store the results as packed double-precision floating-point elements.
__m256d _mm256_floor_pd (__m256d a) @safe
{
    static if (LDC_with_ARM64)
    {
        __m128d lo = _mm256_extractf128_pd!0(a);
        __m128d hi = _mm256_extractf128_pd!1(a);
        __m128d ilo = _mm_floor_pd(lo);
        __m128d ihi = _mm_floor_pd(hi);
        return _mm256_set_m128d(ihi, ilo);
    }
    else
    {
        return _mm256_round_pd!1(a);
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.3f, -2.12f, 53.6f, -2.7f);
    A = _mm256_floor_pd(A);
    double[4] correct = [1.0, -3.0, 53.0, -3.0];
    assert(A.array == correct);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an integer
/// value, and store the results as packed single-precision floating-point elements.
__m256 _mm256_floor_ps (__m256 a) @safe
{
    static if (LDC_with_ARM64)
    {
        __m128 lo = _mm256_extractf128_ps!0(a);
        __m128 hi = _mm256_extractf128_ps!1(a);
        __m128 ilo = _mm_floor_ps(lo);
        __m128 ihi = _mm_floor_ps(hi);
        return _mm256_set_m128(ihi, ilo);
    }
    else
    {
        return _mm256_round_ps!1(a);
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.3f, -2.12f, 53.6f, -2.7f, -1.3f, 2.12f, -53.6f, 2.7f);
    __m256 C = _mm256_floor_ps(A);
    float[8] correct = [1.0f, -3.0f, 53.0f, -3.0f, -2, 2, -54, 2];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in `a`
/// and `b`.
__m256d _mm256_hadd_pd (__m256d a, __m256d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_haddpd256(a, b);
    }
    else
    {
        __m256d res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = b.array[1] + b.array[0];
        res.ptr[2] = a.array[3] + a.array[2];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
    __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
    __m256d C = _mm256_hadd_pd(A, B);
    double[4] correct = [3.5, 8.0, 30.0, 114.0];
    assert(C.array == correct);
}

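// Illustrative sketch (assumed usage): hadd-based horizontal reduction of a
// 4-lane double vector; note the interleaved lane order of the result.
unittest
{
    __m256d v = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    __m256d h = _mm256_hadd_pd(v, v);       // [1+2, 1+2, 3+4, 3+4]
    double total = h.array[0] + h.array[2]; // (1+2) + (3+4)
    assert(total == 10.0);
}
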
/// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in `a` and
/// `b`.
__m256 _mm256_hadd_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_haddps256(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 b_hi = _mm256_extractf128_ps!1(b);
        __m128 b_lo = _mm256_extractf128_ps!0(b);
        __m128 hi = vpaddq_f32(a_hi, b_hi);
        __m128 lo = vpaddq_f32(a_lo, b_lo);
        return _mm256_set_m128(hi, lo);
    }
    else
    {
        __m256 res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        res.ptr[4] = a.array[5] + a.array[4];
        res.ptr[5] = a.array[7] + a.array[6];
        res.ptr[6] = b.array[5] + b.array[4];
        res.ptr[7] = b.array[7] + b.array[6];
        return res;
    }
}
unittest
{
    __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
    __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
    __m256 R = _mm256_hadd_ps(A, B);
    float[8] correct = [3.0f, 8.0f, 3.5f, 7.5f, 3.0f, 8.0f, 3.5f, 8.5f];
    assert(R.array == correct);
}

/// Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in
/// `a` and `b`.
__m256d _mm256_hsub_pd (__m256d a, __m256d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_hsubpd256(a, b);
    }
    else
    {
        // 2 zip1, 2 zip2, 2 fsub... I don't think there is better in arm64
        __m256d res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = b.array[0] - b.array[1];
        res.ptr[2] = a.array[2] - a.array[3];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m256d A =_mm256_setr_pd(1.5, 2.0, 21.0, 9.0);
    __m256d B =_mm256_setr_pd(1.0, 7.0, 100.0, 14.0);
    __m256d C = _mm256_hsub_pd(A, B);
    double[4] correct = [-0.5, -6.0, 12.0, 86.0];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in
/// `a` and `b`.
__m256 _mm256_hsub_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_hsubps256(a, b);
    }
    else
    {
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 b_hi = _mm256_extractf128_ps!1(b);
        __m128 b_lo = _mm256_extractf128_ps!0(b);
        __m128 hi = _mm_hsub_ps(a_hi, b_hi);
        __m128 lo = _mm_hsub_ps(a_lo, b_lo);
        return _mm256_set_m128(hi, lo);
    }
}
unittest
{
    __m256 A =_mm256_setr_ps(1.0f, 2.0f, 3.0f, 5.0f, 1.0f, 2.0f, 3.0f, 5.0f);
    __m256 B =_mm256_setr_ps(1.5f, 2.0f, 3.5f, 4.0f, 1.5f, 2.0f, 3.5f, 5.0f);
    __m256 R = _mm256_hsub_ps(A, B);
    float[8] correct = [-1.0f, -2.0f, -0.5f, -0.5f, -1.0f, -2.0f, -0.5f, -1.5f];
    assert(R.array == correct);
}

/// Copy `a`, and insert the 16-bit integer `i` into the result at the location specified by
/// `index & 15`.
__m256i _mm256_insert_epi16 (__m256i a, short i, const int index) pure @trusted
{
    short16 sa = cast(short16)a;
    sa.ptr[index & 15] = i;
    return cast(__m256i)sa;
}
unittest
{
    __m256i A = _mm256_set1_epi16(1);
    short16 R = cast(short16) _mm256_insert_epi16(A, 2, 16 + 16 + 7);
    short[16] correct = [1, 1, 1, 1, 1, 1, 1, 2,
                         1, 1, 1, 1, 1, 1, 1, 1 ];
    assert(R.array == correct);
}

/// Copy `a`, and insert the 32-bit integer `i` into the result at the location specified by
/// `index & 7`.
__m256i _mm256_insert_epi32 (__m256i a, int i, const int index) pure @trusted
{
    int8 ia = cast(int8)a;
    ia.ptr[index & 7] = i;
    return cast(__m256i)ia;
}
unittest
{
    __m256i A = _mm256_set1_epi32(1);
    int8 R = cast(int8) _mm256_insert_epi32(A, -2, 8 + 8 + 1);
    int[8] correct = [1, -2, 1, 1, 1, 1, 1, 1];
    assert(R.array == correct);
}

/// Copy `a`, and insert the 64-bit integer `i` into the result at the location specified by
/// `index & 3`.
__m256i _mm256_insert_epi64 (__m256i a, long i, const int index) pure @trusted
{
    a.ptr[index & 3] = i;
    return a;
}
unittest
{
    __m256i A = _mm256_set1_epi64(1);
    long4 R = cast(long4) _mm256_insert_epi64(A, -2, 2 - 4 - 4);
    long[4] correct = [1, 1, -2, 1];
    assert(R.array == correct);
}

/// Copy `a`, and insert the 8-bit integer `i` into the result at the location specified by
/// `index & 31`.
__m256i _mm256_insert_epi8 (__m256i a, byte i, const int index) pure @trusted
{
    byte32 ba = cast(byte32)a;
    ba.ptr[index & 31] = i;
    return cast(__m256i)ba;
}
unittest
{
    __m256i A = _mm256_set1_epi8(1);
    byte32 R = cast(byte32) _mm256_insert_epi8(A, -2, 7 - 32 - 32);
    byte[32] correct = [1, 1, 1, 1, 1, 1, 1,-2, 1, 1, 1, 1, 1, 1, 1, 1,
                        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ];
    assert(R.array == correct);
}

/// Copy `a`, then insert 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `b` at the location specified by `imm8`.
__m256d _mm256_insertf128_pd(int imm8)(__m256d a, __m128d b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return __builtin_ia32_vinsertf128_pd256(a, b, lane);
    }
    else
    {
        __m256d r = a;
        enum int index = (imm8 & 1) ? 2 : 0;
        r.ptr[index] = b.array[0];
        r.ptr[index+1] = b.array[1];
        return r;
    }
}

/// Copy `a`, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point
/// elements) from `b` at the location specified by `imm8`.
__m256 _mm256_insertf128_ps(int imm8)(__m256 a, __m128 b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return __builtin_ia32_vinsertf128_ps256(a, b, lane);
    }
    else
    {
        __m256 r = a;
        enum int index = (imm8 & 1) ? 4 : 0;
        r.ptr[index] = b.array[0];
        r.ptr[index+1] = b.array[1];
        r.ptr[index+2] = b.array[2];
        r.ptr[index+3] = b.array[3];
        return r;
    }
}

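// Illustrative example (the insertf128 intrinsics above ship without a test in
// this section): replacing the upper 128-bit lane.
unittest
{
    __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    __m128 B = _mm_setr_ps(8.0f, 9, 10, 11);
    __m256 R = _mm256_insertf128_ps!1(A, B);
    float[8] correct = [0.0f, 1, 2, 3, 8, 9, 10, 11];
    assert(R.array == correct);
}
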
/// Copy `a`, then insert 128 bits from `b` at the location specified by `imm8`.
__m256i _mm256_insertf128_si256(int imm8)(__m256i a, __m128i b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        enum ubyte lane = imm8 & 1;
        return cast(__m256i) __builtin_ia32_vinsertf128_si256 (cast(int8)a, b, lane);
    }
    else
    {
        long2 lb = cast(long2)b;
        __m256i r = a;
        enum int index = (imm8 & 1) ? 2 : 0;
        r.ptr[index] = lb.array[0];
        r.ptr[index+1] = lb.array[1];
        return r;
    }
}

/// Load 256 bits of integer data from unaligned memory into dst.
/// This intrinsic may perform better than `_mm256_loadu_si256` when the data crosses a cache
/// line boundary.
__m256i _mm256_lddqu_si256 (const(__m256i)* mem_addr) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_lddqu256(cast(const(char)*)mem_addr);
    }
    else
        return _mm256_loadu_si256(mem_addr);
}
unittest
{
    int[10] correct = [0, -1, 2, -3, 4, 9, -7, 8, -6, 34];
    int8 A = cast(int8) _mm256_lddqu_si256(cast(__m256i*) &correct[1]);
    assert(A.array == correct[1..9]);
}

/// Load 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements)
/// from memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
/// exception may be generated.
__m256d _mm256_load_pd (const(double)* mem_addr) pure @trusted
{
    return *cast(__m256d*)mem_addr;
}
unittest
{
    static immutable align(32) double[4] correct = [1.0, 2.0, 3.5, -42.0];
    __m256d A = _mm256_load_pd(correct.ptr);
    assert(A.array == correct);
}

/// Load 256 bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
__m256 _mm256_load_ps (const(float)* mem_addr) pure @trusted
{
    return *cast(__m256*)mem_addr;
}
unittest
{
    static immutable align(32) float[8] correct =
        [1.0, 2.0, 3.5, -42.0, 7.43f, 0.0f, 3, 2];
    __m256 A = _mm256_load_ps(correct.ptr);
    assert(A.array == correct);
}

/// Load 256 bits of integer data from memory. `mem_addr` does not need to be aligned on
/// any particular boundary.
// See this dlang forum post => https://forum.dlang.org/thread/vymrsngsfibkmqsqffce@forum.dlang.org
__m256i _mm256_loadu_si256 (const(__m256i)* mem_addr) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) mem_addr);
    }
    else static if (LDC_with_optimizations)
    {
        return loadUnaligned!(__m256i)(cast(long*)mem_addr);
    }
    else
    {
        const(long)* p = cast(const(long)*)mem_addr;
        long4 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        return r;
    }
}
unittest
{
    align(16) int[8] correct = [-1, 2, -3, 4, 9, -7, 8, -6];
    int8 A = cast(int8) _mm256_loadu_si256(cast(__m256i*) correct.ptr);
    assert(A.array == correct);
}

/// Load 256 bits of integer data from memory. `mem_addr` must be aligned on a
/// 32-byte boundary or a general-protection exception may be generated.
__m256i _mm256_load_si256 (const(void)* mem_addr) pure @system
{
    return *cast(__m256i*)mem_addr;
}
unittest
{
    static immutable align(64) long[4] correct = [1, -2, long.min, long.max];
    __m256i A = _mm256_load_si256(correct.ptr);
    assert(A.array == correct);
}

/// Load 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements)
/// from memory. `mem_addr` does not need to be aligned on any particular boundary.
__m256d _mm256_loadu_pd (const(void)* mem_addr) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_loadupd256 ( cast(const(double)*) mem_addr);
    }
    else static if (LDC_with_optimizations)
    {
        return loadUnaligned!(__m256d)(cast(double*)mem_addr);
    }
    else
    {
        const(double)* p = cast(const(double)*)mem_addr;
        double4 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        return r;
    }
}
unittest
{
    double[4] correct = [1.0, -2.0, 0.0, 768.5];
    __m256d A = _mm256_loadu_pd(correct.ptr);
    assert(A.array == correct);
}

/// Load 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m256 _mm256_loadu_ps (const(float)* mem_addr) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_loadups256 ( cast(const(float)*) mem_addr);
    }
    else static if (LDC_with_optimizations)
    {
        return loadUnaligned!(__m256)(cast(float*)mem_addr);
    }
    else
    {
        const(float)* p = cast(const(float)*)mem_addr;
        float8 r = void;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        r.ptr[4] = p[4];
        r.ptr[5] = p[5];
        r.ptr[6] = p[6];
        r.ptr[7] = p[7];
        return r;
    }
}
unittest
{
    align(32) float[10] correct = [0.0f, 1, 2, 3, 4, 5, 6, 7, 8, 9];
    __m256 A = _mm256_loadu_ps(&correct[1]);
    assert(A.array == correct[1..9]);
}

/// Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point
/// elements) from memory, and combine them into a 256-bit value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256 _mm256_loadu2_m128 (const(float)* hiaddr, const(float)* loaddr) pure @system
{
    // Note: no particular instruction for this in x86.
    return _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr));
}
unittest
{
    align(32) float[6] A = [4.5f, 2, 8, 97, -1, 3];
    align(32) float[6] B = [6.5f, 3, 9, 98, -2, 4];
    __m256 R = _mm256_loadu2_m128(&B[1], &A[1]);
    float[8] correct = [2.0f, 8, 97, -1, 3, 9, 98, -2];
    assert(R.array == correct);
}

/// Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point
/// elements) from memory, and combine them into a 256-bit value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256d _mm256_loadu2_m128d (const(double)* hiaddr, const(double)* loaddr) pure @system
{
    // Note: no particular instruction for this in x86.
    return _mm256_set_m128d(_mm_loadu_pd(hiaddr), _mm_loadu_pd(loaddr));
}
unittest
{
    align(32) double[4] A = [4.5, 2, 8, 97];
    align(32) double[4] B = [6.5, 3, 9, 98];
    __m256d R = _mm256_loadu2_m128d(&B[1], &A[1]);
    double[4] correct = [2.0, 8, 3, 9];
    assert(R.array == correct);
}

/// Load two 128-bit values (composed of integer data) from memory, and combine them into a
/// 256-bit value. `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
__m256i _mm256_loadu2_m128i (const(__m128i)* hiaddr, const(__m128i)* loaddr) pure @trusted
{
    // Note: no particular instruction for this in x86.
    return _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr));
}
unittest
{
    align(32) long[4] A = [5, 2, 8, 97];
    align(32) long[4] B = [6, 3, 9, 98];
    __m256i R = _mm256_loadu2_m128i(cast(const(__m128i)*) &B[1], cast(const(__m128i)*) &A[1]);
    long[4] correct = [2, 8, 3, 9];
    assert(R.array == correct);
}

version(DigitalMars)
{
    // this avoids a bug with DMD < 2.099 -a x86 -O
    private enum bool maskLoadWorkaroundDMD = (__VERSION__ < 2099);
}
else
{
    private enum bool maskLoadWorkaroundDMD = false;
}

/// Load packed double-precision (64-bit) floating-point elements from memory using `mask`
/// (elements are zeroed out when the high bit of the corresponding element is not set).
/// Note: emulating this instruction isn't efficient, since the real instruction performs
/// memory accesses only where the mask is set.
/// See: "Note about mask load/store" to know why you must address valid memory only.
__m128d _mm_maskload_pd (const(double)* mem_addr, __m128i mask) /* pure */ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadpd(mem_addr, cast(long2)mask);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_maskloadpd(cast(double2*)mem_addr, cast(long2)mask);
    }
    else
    {
        __m128d a = _mm_loadu_pd(mem_addr);
        __m128d zero = _mm_setzero_pd();
        return _mm_blendv_pd(zero, a, cast(double2)mask);
    }
}
unittest
{
    static if (!maskLoadWorkaroundDMD)
    {
        double[2] A = [7.5, 1];
        double2 B = _mm_maskload_pd(A.ptr, _mm_setr_epi64(-1, 1));
        double[2] correct = [7.5, 0];
        assert(B.array == correct);
    }
}

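// Illustrative sketch (an assumed pattern, not from the original docs): given
// the tightened semantics described in the note at the top of this module,
// handle an array tail by copying it to a fully addressable stack buffer
// before the masked load, instead of masking past the array's end.
unittest
{
    static if (!maskLoadWorkaroundDMD)
    {
        float[3] tail = [10.0f, 20.0f, 30.0f]; // only 3 valid floats
        align(16) float[4] padded = 0.0f;      // 16 bytes, all addressable
        padded[0 .. 3] = tail[];
        __m128i mask = _mm_setr_epi32(-1, -1, -1, 0);
        __m128 v = _mm_maskload_ps(padded.ptr, mask);
        float[4] correct = [10.0f, 20.0f, 30.0f, 0.0f];
        assert(v.array == correct);
    }
}
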
__m256d _mm256_maskload_pd (const(double)* mem_addr, __m256i mask) /*pure*/ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadpd256(mem_addr, mask);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_maskloadpd256(cast(double4*)mem_addr, mask);
    }
    else
    {
        __m256d a = _mm256_loadu_pd(mem_addr);
        __m256d zero = _mm256_setzero_pd();
        return _mm256_blendv_pd(zero, a, cast(double4)mask);
    }
}
unittest
{
    static if (!maskLoadWorkaroundDMD)
    {
        double[4] A = [7.5, 1, 2, 3];
        double4 B = _mm256_maskload_pd(A.ptr, _mm256_setr_epi64(1, -1, -1, 1));
        double[4] correct = [0.0, 1, 2, 0];
        assert(B.array == correct);
    }
}

/// Load packed single-precision (32-bit) floating-point elements from memory using `mask`
/// (elements are zeroed out when the high bit of the corresponding element is not set).
/// Warning: See "Note about mask load/store" to know why you must address valid memory only.
__m128 _mm_maskload_ps (const(float)* mem_addr, __m128i mask) /* pure */ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadps(mem_addr, mask);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_maskloadps(cast(float4*)mem_addr, mask);
    }
    else
    {
        __m128 a = _mm_loadu_ps(mem_addr);
        __m128 zero = _mm_setzero_ps();
        return _mm_blendv_ps(zero, a, cast(float4)mask);
    }
}
unittest
{
    static if (!maskLoadWorkaroundDMD)
    {
        float[4] A = [7.5f, 1, 2, 3];
        float4 B = _mm_maskload_ps(A.ptr, _mm_setr_epi32(1, -1, -1, 1)); // can NOT address invalid memory!
        float[4] correct = [0.0f, 1, 2, 0];
        assert(B.array == correct);
    }
}

/// Load packed single-precision (32-bit) floating-point elements from memory using `mask`
/// (elements are zeroed out when the high bit of the corresponding element is not set).
/// Note: emulating that instruction isn't efficient, since unlike the real instruction the
/// emulation can't restrict its memory accesses to the elements selected by the mask.
/// See: "Note about mask load/store" to know why you must address valid memory only.
__m256 _mm256_maskload_ps (const(float)* mem_addr, __m256i mask) /*pure*/ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        return __builtin_ia32_maskloadps256(mem_addr, cast(int8)mask);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_maskloadps256(cast(float8*)mem_addr, cast(int8)mask);
    }
    else
    {
        __m256 a = _mm256_loadu_ps(mem_addr);
        __m256 zero = _mm256_setzero_ps();
        return _mm256_blendv_ps(zero, a, cast(float8)mask);
    }
}
unittest
{
    float[8] A = [1, 7.5f, 1, 2, 3, 4, 5, 6];
    __m256i M = _mm256_setr_epi32(1, -1, 1, -1, 1, -1, -1, 1);
    float8 B = _mm256_maskload_ps(A.ptr, M);
    float[8] correct = [0.0f, 7.5f, 0, 2, 0, 4, 5, 0];
    assert(B.array == correct);
}

/// Store packed double-precision (64-bit) floating-point elements from `a` into memory using `mask`.
/// Note: emulating that instruction isn't efficient, since unlike the real instruction the
/// emulation can't restrict its memory accesses to the elements selected by the mask.
/// See: "Note about mask load/store" to know why you must address valid memory only.
void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a) /* pure */ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        __builtin_ia32_maskstorepd(mem_addr, cast(long2)mask, a);
    }
    else static if (GDC_with_AVX)
    {
        __builtin_ia32_maskstorepd(cast(double2*)mem_addr, cast(long2)mask, a);
    }
    else
    {
        __m128d source = _mm_loadu_pd(mem_addr);
        __m128d r = _mm_blendv_pd(source, a, cast(double2) mask);
        _mm_storeu_pd(mem_addr, r);
    }
}
unittest
{
    double[2] A = [0.0, 1.0];
    __m128i M = _mm_setr_epi64(-1, 0);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    _mm_maskstore_pd(A.ptr, M, B);
    double[2] correct = [2.0, 1.0];
    assert(A == correct);
}


/// Store packed double-precision (64-bit) floating-point elements from `a` into memory using `mask`.
/// See: "Note about mask load/store" to know why you must address valid memory only.
static if (!llvm256BitStackWorkaroundIn32BitX86)
{
    void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a) /* pure */ @system
    {
        // PERF DMD
        static if (LDC_with_AVX)
        {
            // MAYDO report that the builtin is impure
            __builtin_ia32_maskstorepd256(mem_addr, cast(long4)mask, a);
        }
        else static if (GDC_with_AVX)
        {
            __builtin_ia32_maskstorepd256(cast(double4*)mem_addr, cast(long4)mask, a);
        }
        else
        {
            __m256d source = _mm256_loadu_pd(mem_addr);
            __m256d r = _mm256_blendv_pd(source, a, cast(double4) mask);
            _mm256_storeu_pd(mem_addr, r);
        }
    }
    unittest
    {
        double[4] A = [0.0, 1, 2, 3];
        __m256i M = _mm256_setr_epi64x(-9, 0, -1, 0);
        __m256d B = _mm256_setr_pd(2, 3, 4, 5);
        _mm256_maskstore_pd(A.ptr, M, B);
        double[4] correct = [2.0, 1, 4, 3];
        assert(A == correct);
    }
}

/// Store packed single-precision (32-bit) floating-point elements from `a` into memory using `mask`.
/// Note: emulating that instruction isn't efficient, since unlike the real instruction the
/// emulation can't restrict its memory accesses to the elements selected by the mask.
/// See: "Note about mask load/store" to know why you must address valid memory only.
void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a) /* pure */ @system
{
    // PERF DMD
    static if (LDC_with_AVX)
    {
        // MAYDO report that the builtin is impure
        __builtin_ia32_maskstoreps(mem_addr, mask, a);
    }
    else static if (GDC_with_AVX)
    {
        __builtin_ia32_maskstoreps(cast(float4*)mem_addr, mask, a);
    }
    else
    {
        __m128 source = _mm_loadu_ps(mem_addr);
        __m128 r = _mm_blendv_ps(source, a, cast(float4) mask);
        _mm_storeu_ps(mem_addr, r);
    }
}
unittest
{
    float[4] A = [0.0f, 1, 2, 6];
    __m128i M = _mm_setr_epi32(-1, 0, -1, 0);
    __m128 B = _mm_setr_ps(2, 3, 4, 5);
    _mm_maskstore_ps(A.ptr, M, B);
    float[4] correct = [2.0f, 1, 4, 6];
    assert(A == correct);
}

static if (!llvm256BitStackWorkaroundIn32BitX86)
{
    /// Store packed single-precision (32-bit) floating-point elements from `a` into memory using `mask`.
    /// See: "Note about mask load/store" to know why you must address valid memory only.
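    // Illustrative sketch (added example, not from the original library): a
    // masked tail-store. As with the masked loads, every lane must address
    // valid memory, so the destination is padded to a full 8 floats.
    //
    //     align(32) float[8] dst = 0.0f; // lanes 5..7 exist but are left untouched
    //     __m256i mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 0, 0);
    //     _mm256_maskstore_ps(dst.ptr, mask, _mm256_set1_ps(1.0f)); // writes dst[0..5]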
    void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) /* pure */ @system
    {
        // PERF DMD
        static if (LDC_with_AVX)
        {
            // MAYDO report that the builtin is impure
            __builtin_ia32_maskstoreps256(mem_addr, cast(int8)mask, a);
        }
        else static if (GDC_with_AVX)
        {
            __builtin_ia32_maskstoreps256(cast(float8*)mem_addr, cast(int8)mask, a);
        }
        else
        {
            __m256 source = _mm256_loadu_ps(mem_addr);
            __m256 r = _mm256_blendv_ps(source, a, cast(float8) mask);
            _mm256_storeu_ps(mem_addr, r);
        }
    }
    unittest
    {
        float[8] A = [0.0f, 0, 1, 2, 3, 4, 5, 7];
        __m256i M = _mm256_setr_epi32( 0, -1, 0, -1, 0, -1, -1, 0);
        __m256 B = _mm256_set1_ps(6.0f);
        _mm256_maskstore_ps(A.ptr, M, B);
        float[8] correct = [0.0f, 6, 1, 6, 3, 6, 6, 7];
        assert(A == correct);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return
/// packed maximum values.
__m256d _mm256_max_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_maxpd256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2
        // PERF: GDC without AVX
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
    __m256d B = _mm256_setr_pd(1.0, 8.0, 0.0, 100000.0);
    __m256d M = _mm256_max_pd(A, B);
    double[4] correct = [4.0, 8.0, 0.0, double.infinity];
    assert(M.array == correct);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return
/// packed maximum values.
__m256 _mm256_max_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_maxps256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2, but looks brittle.
        // PERF GDC without AVX
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] > b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] > b.array[3]) ? a.array[3] : b.array[3];
        a.ptr[4] = (a.array[4] > b.array[4]) ? a.array[4] : b.array[4];
        a.ptr[5] = (a.array[5] > b.array[5]) ? a.array[5] : b.array[5];
        a.ptr[6] = (a.array[6] > b.array[6]) ? a.array[6] : b.array[6];
        a.ptr[7] = (a.array[7] > b.array[7]) ? a.array[7] : b.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
    __m256 B = _mm256_setr_ps(1.0, 8.0, 0.0, 100000.0f, 4, 3, 2, 1);
    __m256 M = _mm256_max_ps(A, B);
    float[8] correct = [4.0, 8.0, 0.0, float.infinity, 4, 3, 3, 4];
    assert(M.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return
/// packed minimum values.
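// Note (added example): like the x86 MINPD/MAXPD instructions, the scalar
// fallbacks in _mm256_max_* above and _mm256_min_* below use `(a op b) ? a : b`,
// so when either operand is NaN the second operand is returned:
//
//     __m256d n = _mm256_set1_pd(double.nan);
//     __m256d y = _mm256_set1_pd(1.0);
//     _mm256_min_pd(n, y); // 1.0 in each lane, since (nan < 1.0) is false
//     _mm256_min_pd(y, n); // NaN in each lane, since (1.0 < nan) is false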
__m256d _mm256_min_pd (__m256d a, __m256d b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_minpd256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2
        // PERF: GDC without AVX
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(4.0, 1.0, -9.0, double.infinity);
    __m256d B = _mm256_setr_pd(1.0, 8.0, 0.0, 100000.0);
    __m256d M = _mm256_min_pd(A, B);
    double[4] correct = [1.0, 1.0, -9.0, 100000.0];
    assert(M.array == correct);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return
/// packed minimum values.
__m256 _mm256_min_ps (__m256 a, __m256 b) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_minps256(a, b);
    }
    else
    {
        // LDC: becomes good in -O2, but looks brittle.
        // PERF GDC without AVX
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        a.ptr[2] = (a.array[2] < b.array[2]) ? a.array[2] : b.array[2];
        a.ptr[3] = (a.array[3] < b.array[3]) ? a.array[3] : b.array[3];
        a.ptr[4] = (a.array[4] < b.array[4]) ? a.array[4] : b.array[4];
        a.ptr[5] = (a.array[5] < b.array[5]) ? a.array[5] : b.array[5];
        a.ptr[6] = (a.array[6] < b.array[6]) ? a.array[6] : b.array[6];
        a.ptr[7] = (a.array[7] < b.array[7]) ? a.array[7] : b.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(4.0, 1.0, -9.0, float.infinity, 1, 2, 3, 4);
    __m256 B = _mm256_setr_ps(1.0, 8.0, 0.0, 100000.0f, 4, 3, 2, 1);
    __m256 M = _mm256_min_ps(A, B);
    float[8] correct = [1.0, 1.0, -9.0, 100000.0f, 1, 2, 2, 1];
    assert(M.array == correct);
}

/// Duplicate even-indexed double-precision (64-bit) floating-point elements from `a`.
__m256d _mm256_movedup_pd (__m256d a) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_movddup256 (a);
    }
    else
    {
        a.ptr[1] = a.array[0];
        a.ptr[3] = a.array[2];
        return a;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    A = _mm256_movedup_pd(A);
    double[4] correct = [1.0, 1, 3, 3];
    assert(A.array == correct);
}

/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
__m256 _mm256_movehdup_ps (__m256 a) @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_movshdup256 (a);
    }
    else
    {
        a.ptr[0] = a.array[1];
        a.ptr[2] = a.array[3];
        a.ptr[4] = a.array[5];
        a.ptr[6] = a.array[7];
        return a;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8);
    A = _mm256_movehdup_ps(A);
    float[8] correct = [2.0, 2, 4, 4, 6, 6, 8, 8];
    assert(A.array == correct);
}

/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
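// Added example (hedged): _mm256_moveldup_ps and _mm256_movehdup_ps are the
// usual building blocks for interleaved complex arithmetic. With 4 complex
// numbers stored as z = [re0, im0, re1, im1, ...]:
//
//     __m256 re = _mm256_moveldup_ps(z); // [re0, re0, re1, re1, re2, re2, re3, re3]
//     __m256 im = _mm256_movehdup_ps(z); // [im0, im0, im1, im1, im2, im2, im3, im3]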
2270 __m256 _mm256_moveldup_ps (__m256 a) @trusted 2271 { 2272 // PERF DMD 2273 static if (GDC_with_AVX) 2274 { 2275 return __builtin_ia32_movsldup256 (a); 2276 } 2277 else 2278 { 2279 a.ptr[1] = a.array[0]; 2280 a.ptr[3] = a.array[2]; 2281 a.ptr[5] = a.array[4]; 2282 a.ptr[7] = a.array[6]; 2283 return a; 2284 } 2285 } 2286 unittest 2287 { 2288 __m256 A = _mm256_setr_ps(1.0f, 2, 3, 4, 5, 6, 7, 8); 2289 A = _mm256_moveldup_ps(A); 2290 float[8] correct = [1.0, 1, 3, 3, 5, 5, 7, 7]; 2291 assert(A.array == correct); 2292 } 2293 2294 /// Set each bit of result mask based on the most significant bit of the corresponding packed 2295 /// double-precision (64-bit) floating-point element in `a`. 2296 int _mm256_movemask_pd (__m256d a) @safe 2297 { 2298 // PERF DMD 2299 static if (GDC_or_LDC_with_AVX) 2300 { 2301 return __builtin_ia32_movmskpd256(a); 2302 } 2303 else static if (LDC_with_SSE2) 2304 { 2305 // this doesn't benefit GDC, and not clear for arm64. 2306 __m128d A_lo = _mm256_extractf128_pd!0(a); 2307 __m128d A_hi = _mm256_extractf128_pd!1(a); 2308 2309 return (_mm_movemask_pd(A_hi) << 2) | _mm_movemask_pd(A_lo); 2310 } 2311 else 2312 { 2313 // Fortunately, branchless on arm64 2314 long4 lv = cast(long4)a; 2315 int r = 0; 2316 if (lv.array[0] < 0) r += 1; 2317 if (lv.array[1] < 0) r += 2; 2318 if (lv.array[2] < 0) r += 4; 2319 if (lv.array[3] < 0) r += 8; 2320 return r; 2321 } 2322 } 2323 unittest 2324 { 2325 __m256d A = _mm256_setr_pd(-1, -double.infinity, 0, -1); 2326 assert(_mm256_movemask_pd(A) == 1 + 2 + 8); 2327 } 2328 2329 /// Set each bit of mask result based on the most significant bit of the corresponding packed 2330 /// single-precision (32-bit) floating-point element in `a`. 2331 int _mm256_movemask_ps (__m256 a) @system 2332 { 2333 // PERF DMD 2334 // PERF GDC without AVX 2335 static if (GDC_or_LDC_with_AVX) 2336 { 2337 return __builtin_ia32_movmskps256(a); 2338 } 2339 else version(LDC) 2340 { 2341 // this doesn't benefit GDC (unable to inline), but benefits both LDC with SSE2 and ARM64 2342 __m128 A_lo = _mm256_extractf128_ps!0(a); 2343 __m128 A_hi = _mm256_extractf128_ps!1(a); 2344 return (_mm_movemask_ps(A_hi) << 4) | _mm_movemask_ps(A_lo); 2345 } 2346 else 2347 { 2348 int8 lv = cast(int8)a; 2349 int r = 0; 2350 if (lv.array[0] < 0) r += 1; 2351 if (lv.array[1] < 0) r += 2; 2352 if (lv.array[2] < 0) r += 4; 2353 if (lv.array[3] < 0) r += 8; 2354 if (lv.array[4] < 0) r += 16; 2355 if (lv.array[5] < 0) r += 32; 2356 if (lv.array[6] < 0) r += 64; 2357 if (lv.array[7] < 0) r += 128; 2358 return r; 2359 } 2360 } 2361 unittest 2362 { 2363 __m256 A = _mm256_setr_ps(-1, -double.infinity, 0, -1, 1, double.infinity, -2, double.nan); 2364 assert(_mm256_movemask_ps(A) == 1 + 2 + 8 + 64); 2365 } 2366 2367 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`. 2368 __m256d _mm256_mul_pd (__m256d a, __m256d b) pure @safe 2369 { 2370 return a * b; 2371 } 2372 unittest 2373 { 2374 __m256d a = [-2.0, 1.5, -2.0, 1.5]; 2375 a = _mm256_mul_pd(a, a); 2376 assert(a.array == [4.0, 2.25, 4.0, 2.25]); 2377 } 2378 2379 /// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`. 
__m256 _mm256_mul_ps (__m256 a, __m256 b) pure @safe
{
    return a * b;
}
unittest
{
    __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm256_mul_ps(a, a);
    float[8] correct = [2.25f, 4.0f, 9.0f, 1.0f, 2.25f, 4.0f, 9.0f, 1.0f];
    assert(a.array == correct);
}


/// Compute the bitwise NOT of 256 bits in `a`. #BONUS
__m256i _mm256_not_si256 (__m256i a) pure @safe
{
    return ~a;
}
unittest
{
    __m256i A = _mm256_set1_epi64x(-748);
    long4 notA = cast(long4) _mm256_not_si256(A);
    long[4] correct = [747, 747, 747, 747];
    assert(notA.array == correct);
}

/// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_or_pd (__m256d a, __m256d b) pure @safe
{
    return cast(__m256d)( cast(__m256i)a | cast(__m256i)b );
}

/// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_or_ps (__m256 a, __m256 b) pure @safe
{
    return cast(__m256)( cast(__m256i)a | cast(__m256i)b );
}

/// Shuffle double-precision (64-bit) floating-point elements in `a` using the control in `imm8`.
__m128d _mm_permute_pd(int imm8)(__m128d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilpd(a, imm8 & 3);
    }
    else
    {
        // Shufflevector not particularly better for LDC here
        __m128d r;
        r.ptr[0] = a.array[imm8 & 1];
        r.ptr[1] = a.array[(imm8 >> 1) & 1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(5, 6);
    __m128d B = _mm_permute_pd!1(A);
    __m128d C = _mm_permute_pd!3(A);
    double[2] RB = [6, 5];
    double[2] RC = [6, 6];
    assert(B.array == RB);
    assert(C.array == RC);
}

///ditto
__m256d _mm256_permute_pd(int imm8)(__m256d a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilpd256(a, imm8 & 15);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(double4,
                                 (imm8 >> 0) & 1,
                                 ( (imm8 >> 1) & 1),
                                 2 + ( (imm8 >> 2) & 1),
                                 2 + ( (imm8 >> 3) & 1) )(a, a);
    }
    else
    {
        __m256d r;
        r.ptr[0] = a.array[ imm8 & 1];
        r.ptr[1] = a.array[(imm8 >> 1) & 1];
        r.ptr[2] = a.array[2 + ((imm8 >> 2) & 1)];
        r.ptr[3] = a.array[2 + ((imm8 >> 3) & 1)];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(0.0, 1, 2, 3);
    __m256d R = _mm256_permute_pd!(1 + 4)(A);
    double[4] correct = [1.0, 0, 3, 2];
    assert(R.array == correct);
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` using the control in `imm8`.
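// Added example (hedged): `imm8` uses the same 2-bits-per-lane encoding as
// _mm_shuffle_ps, so the _MM_SHUFFLE helper can be used to build it:
//
//     __m128 v = _mm_setr_ps(0.0f, 1, 2, 3);
//     __m128 r = _mm_permute_ps!(_MM_SHUFFLE(0, 1, 2, 3))(v); // [3, 2, 1, 0]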
__m128 _mm_permute_ps(int imm8)(__m128 a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilps(a, cast(ubyte)imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(float4, (imm8 >> 0) & 3, (imm8 >> 2) & 3, (imm8 >> 4) & 3,
                                 (imm8 >> 6) & 3)(a, a);
    }
    else
    {
        // PERF: could use _mm_shuffle_ps, which is a superset of this intrinsic,
        // when AVX isn't available
        __m128 r;
        r.ptr[0] = a.array[(imm8 >> 0) & 3];
        r.ptr[1] = a.array[(imm8 >> 2) & 3];
        r.ptr[2] = a.array[(imm8 >> 4) & 3];
        r.ptr[3] = a.array[(imm8 >> 6) & 3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(0.0f, 1, 2, 3);
    __m128 R = _mm_permute_ps!(1 + 4 * 3 + 16 * 0 + 64 * 2)(A);
    float[4] correct = [1.0f, 3, 0, 2];
    assert(R.array == correct);
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using
/// the control in `imm8`. The same shuffle is applied to the lower and higher 128-bit lanes.
__m256 _mm256_permute_ps(int imm8)(__m256 a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_vpermilps256(a, cast(ubyte)imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(float8,
            (imm8 >> 0) & 3, (imm8 >> 2) & 3, (imm8 >> 4) & 3, (imm8 >> 6) & 3,
            4 + ((imm8 >> 0) & 3), 4 + ((imm8 >> 2) & 3), 4 + ((imm8 >> 4) & 3),
            4 + ((imm8 >> 6) & 3))(a, a);
    }
    else
    {
        __m256 r;
        r.ptr[0] = a.array[(imm8 >> 0) & 3];
        r.ptr[1] = a.array[(imm8 >> 2) & 3];
        r.ptr[2] = a.array[(imm8 >> 4) & 3];
        r.ptr[3] = a.array[(imm8 >> 6) & 3];
        r.ptr[4] = a.array[4 + ((imm8 >> 0) & 3)];
        r.ptr[5] = a.array[4 + ((imm8 >> 2) & 3)];
        r.ptr[6] = a.array[4 + ((imm8 >> 4) & 3)];
        r.ptr[7] = a.array[4 + ((imm8 >> 6) & 3)];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    __m256 R = _mm256_permute_ps!(1 + 4 * 3 + 16 * 0 + 64 * 2)(A);
    float[8] correct = [1.0f, 3, 0, 2, 5, 7, 4, 6];
    assert(R.array == correct);
}

/// Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// selected by `imm8` from `a` and `b`.
/// See the documentation as the `imm8` format is quite complex.
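// Added explanation (hedged): each nibble of `imm8` independently selects one
// 128-bit half for the corresponding half of the result:
//     0 = low half of `a`,  1 = high half of `a`,
//     2 = low half of `b`,  3 = high half of `b`,
//     bit 3 of the nibble set = force that result half to zero.
// The low nibble picks the result's low half, the high nibble its high half.
// For example, imm8 = 0x21 puts the high half of `a` in the low half of the
// result, and the low half of `b` in its high half:
//
//     __m256d r = _mm256_permute2f128_pd!0x21(a, b); // [a2, a3, b0, b1]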
__m256d _mm256_permute2f128_pd(int imm8)(__m256d a, __m256d b) pure @safe
{
    return cast(__m256d) _mm256_permute2f128_si256!imm8(cast(__m256i)a, cast(__m256i)b);
}
///ditto
__m256 _mm256_permute2f128_ps(int imm8)(__m256 a, __m256 b) pure @safe
{
    return cast(__m256) _mm256_permute2f128_si256!imm8(cast(__m256i)a, cast(__m256i)b);
}
///ditto
__m256i _mm256_permute2f128_si256(int imm8)(__m256i a, __m256i b) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_vperm2f128_si256(cast(int8)a, cast(int8)b, cast(ubyte)imm8);
    }
    else
    {
        static __m128i SELECT4(int imm4)(__m256i a, __m256i b) pure @trusted
        {
            static assert(imm4 >= 0 && imm4 <= 15);
            static if (imm4 & 8)
            {
                return _mm_setzero_si128();
            }
            else static if ((imm4 & 2) == 0)
            {
                long2 r;
                enum int index = 2*(imm4 & 1);
                r.ptr[0] = a.array[index+0];
                r.ptr[1] = a.array[index+1];
                return cast(__m128i)r;
            }
            else
            {
                static assert( (imm4 & 2) != 0);
                long2 r;
                enum int index = 2*(imm4 & 1);
                r.ptr[0] = b.array[index+0];
                r.ptr[1] = b.array[index+1];
                return cast(__m128i)r;
            }
        }

        __m128i lo = SELECT4!(imm8 & 15)(a, b);
        __m128i hi = SELECT4!((imm8 >> 4) & 15)(a, b);
        return _mm256_set_m128i(hi, lo);
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(8.0, 1, 2, 3);
    __m256d B = _mm256_setr_pd(4.0, 5, 6, 7);
    __m256d R = _mm256_permute2f128_pd!(128 + 2)(A, B);
    double[4] correct = [4.0, 5.0, 0.0, 0.0];
    assert(R.array == correct);

    __m256d R2 = _mm256_permute2f128_pd!(3*16 + 1)(A, B);
    double[4] correct2 = [2.0, 3.0, 6.0, 7.0];
    assert(R2.array == correct2);
}

/// Shuffle double-precision (64-bit) floating-point elements in `a` using the control in `b`.
/// Warning: the selector is in bit 1, not bit 0, of each 64-bit element!
/// This is really not intuitive.
__m128d _mm_permutevar_pd(__m128d a, __m128i b) pure @trusted
{
    enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64;

    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m128d) __builtin_ia32_vpermilvarpd(a, cast(long2)b);
    }
    else static if (implementWithByteShuffle)
    {
        align(16) static immutable byte[16] mmAddBase_u8 = [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7];
        align(16) static immutable byte[16] mmBroadcast_u8 = [0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8];
        int4 bi = cast(int4)b;
        long2 two;
        two = 2;
        bi = _mm_slli_epi64(cast(__m128i)( (cast(long2)bi) & two), 2);
        bi = _mm_shuffle_epi8(bi, *cast(__m128i*)mmBroadcast_u8.ptr);
        // bi is now [ind0 ind0 ind0 ind0 ind0 ind0 ind0 ind0 ind1 ind1 ind1 ind1 ind1 ind1 ind1 ind1]
        byte16 bytesIndices = cast(byte16)bi;
        bytesIndices = bytesIndices + *cast(byte16*)mmAddBase_u8.ptr;

        // which allows us to make a single _mm_shuffle_epi8
        return cast(__m128d) _mm_shuffle_epi8(cast(__m128i)a, cast(__m128i)bytesIndices);
    }
    else
    {
        // This isn't great on ARM64: the TBL/TBX instructions would fit the bill,
        // but they only operate on 8-bit lanes, not the 64-bit lanes needed here.
        // SVE2 could probably do it with svtbx[_f64].
        long2 bl = cast(long2)b;
        __m128d r;
        r.ptr[0] = a.array[ (bl.array[0] & 2) >> 1];
        r.ptr[1] = a.array[ (bl.array[1] & 2) >> 1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(5, 6);
    __m128d B = _mm_permutevar_pd(A, _mm_setr_epi64(2, 1));
    __m128d C = _mm_permutevar_pd(A, _mm_setr_epi64(1 + 2 + 4, 2));
    // yup, this is super strange, it's actually taking bit 1 and not bit 0 of each 64-bit element
    double[2] RB = [6, 5];
    double[2] RC = [6, 6];
    assert(B.array == RB);
    assert(C.array == RC);
}

///ditto
__m256d _mm256_permutevar_pd (__m256d a, __m256i b) pure @trusted
{
    // The byte-shuffle path is worth it: for GDC with SSSE3+; for LDC, all the time.
    version(LDC)
        enum bool implementWithByteShuffle = true;
    else
        enum bool implementWithByteShuffle = GDC_with_SSSE3;

    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return cast(__m256d) __builtin_ia32_vpermilvarpd256(a, cast(long4)b);
    }
    else static if (implementWithByteShuffle)
    {
        // because we don't have 256-bit vectors, split and use _mm_permutevar_pd
        __m128d a_lo = _mm256_extractf128_pd!0(a);
        __m128d a_hi = _mm256_extractf128_pd!1(a);
        __m128i b_lo = _mm256_extractf128_si256!0(b);
        __m128i b_hi = _mm256_extractf128_si256!1(b);
        __m128d r_lo = _mm_permutevar_pd(a_lo, b_lo);
        __m128d r_hi = _mm_permutevar_pd(a_hi, b_hi);
        return _mm256_set_m128d(r_hi, r_lo);
    }
    else
    {
        long4 bl = cast(long4)b;
        __m256d r;
        r.ptr[0] = a.array[ (bl.array[0] & 2) >> 1];
        r.ptr[1] = a.array[ (bl.array[1] & 2) >> 1];
        r.ptr[2] = a.array[2 + ((bl.array[2] & 2) >> 1)];
        r.ptr[3] = a.array[2 + ((bl.array[3] & 2) >> 1)];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(5, 6, 7, 8);
    __m256d B = _mm256_permutevar_pd(A, _mm256_setr_epi64(2, 1, 0, 2));
    __m256d C = _mm256_permutevar_pd(A, _mm256_setr_epi64(1 + 2 + 4, 2, 2, 0));
    // yup, this is super strange, it's actually taking bit 1 and not bit 0 of each 64-bit element
    double[4] RB = [6, 5, 7, 8];
    double[4] RC = [6, 6, 8, 7];
    assert(B.array == RB);
    assert(C.array == RC);
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` using the control in `b`.
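// Added example (hedged): unlike _mm_permute_ps, whose control is a
// compile-time constant, _mm_permutevar_ps takes its selectors at runtime,
// from the low 2 bits of each 32-bit element of `b` (no bit-1 oddity here):
//
//     __m128 v    = _mm_setr_ps(10.0f, 11, 12, 13);
//     __m128i sel = _mm_setr_epi32(3, 2, 1, 0); // reverse the lanes
//     __m128 r    = _mm_permutevar_ps(v, sel);  // [13, 12, 11, 10]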
2717 __m128 _mm_permutevar_ps (__m128 a, __m128i b) @trusted 2718 { 2719 // PERF DMD 2720 2721 enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64; 2722 2723 static if (GDC_or_LDC_with_AVX) 2724 { 2725 return cast(__m128) __builtin_ia32_vpermilvarps(a, cast(int4)b); 2726 } 2727 else static if (implementWithByteShuffle) 2728 { 2729 // This workaround is worth it: in GDC with SSSE3, in LDC with SSSE3, in ARM64 (neon) 2730 int4 bi = cast(int4)b; 2731 int4 three; 2732 three = 3; 2733 bi = _mm_slli_epi32(bi & three, 2); 2734 // bi is [ind0 0 0 0 ind1 0 0 0 ind2 0 0 0 ind3 0 0 0] 2735 bi = bi | _mm_slli_si128!1(bi); 2736 bi = bi | _mm_slli_si128!2(bi); 2737 // bi is now [ind0 ind0 ind0 ind0 ind1 ind1 ind1 ind1 ind2 ind2 ind2 ind2 ind3 ind3 ind3 ind3] 2738 byte16 bytesIndices = cast(byte16)bi; 2739 align(16) static immutable byte[16] mmAddBase_u8 = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]; 2740 bytesIndices = bytesIndices + *cast(byte16*)mmAddBase_u8.ptr; 2741 2742 // which allows us to make a single _mm_shuffle_epi8 2743 return cast(__m128) _mm_shuffle_epi8(cast(__m128i)a, cast(__m128i)bytesIndices); 2744 } 2745 else 2746 { 2747 2748 int4 bi = cast(int4)b; 2749 __m128 r; 2750 r.ptr[0] = a.array[ (bi.array[0] & 3) ]; 2751 r.ptr[1] = a.array[ (bi.array[1] & 3) ]; 2752 r.ptr[2] = a.array[ (bi.array[2] & 3) ]; 2753 r.ptr[3] = a.array[ (bi.array[3] & 3) ]; 2754 return r; 2755 } 2756 } 2757 unittest 2758 { 2759 __m128 A = _mm_setr_ps(5, 6, 7, 8); 2760 __m128 B = _mm_permutevar_ps(A, _mm_setr_epi32(2, 1, 0, 2 + 4)); 2761 __m128 C = _mm_permutevar_ps(A, _mm_setr_epi32(2, 3 + 8, 1, 0)); 2762 float[4] RB = [7, 6, 5, 7]; 2763 float[4] RC = [7, 8, 6, 5]; 2764 assert(B.array == RB); 2765 assert(C.array == RC); 2766 } 2767 2768 ///ditto 2769 __m256 _mm256_permutevar_ps (__m256 a, __m256i b) @trusted 2770 { 2771 // In order to do those two, it is necessary to use _mm_shuffle_epi8 and reconstruct the integers afterwards. 
2772 enum bool implementWithByteShuffle = GDC_with_SSSE3 || LDC_with_SSSE3 || LDC_with_ARM64; 2773 2774 static if (GDC_or_LDC_with_AVX) 2775 { 2776 return __builtin_ia32_vpermilvarps256(a, cast(int8)b); 2777 } 2778 else static if (implementWithByteShuffle) 2779 { 2780 // because we don't have 256-bit vectors, split and use _mm_permutevar_ps 2781 __m128 a_lo = _mm256_extractf128_ps!0(a); 2782 __m128 a_hi = _mm256_extractf128_ps!1(a); 2783 __m128i b_lo = _mm256_extractf128_si256!0(b); 2784 __m128i b_hi = _mm256_extractf128_si256!1(b); 2785 __m128 r_lo = _mm_permutevar_ps(a_lo, b_lo); 2786 __m128 r_hi = _mm_permutevar_ps(a_hi, b_hi); 2787 return _mm256_set_m128(r_hi, r_lo); 2788 } 2789 else 2790 { 2791 int8 bi = cast(int8)b; 2792 __m256 r; 2793 r.ptr[0] = a.array[ (bi.array[0] & 3) ]; 2794 r.ptr[1] = a.array[ (bi.array[1] & 3) ]; 2795 r.ptr[2] = a.array[ (bi.array[2] & 3) ]; 2796 r.ptr[3] = a.array[ (bi.array[3] & 3) ]; 2797 r.ptr[4] = a.array[ 4 + (bi.array[4] & 3) ]; 2798 r.ptr[5] = a.array[ 4 + (bi.array[5] & 3) ]; 2799 r.ptr[6] = a.array[ 4 + (bi.array[6] & 3) ]; 2800 r.ptr[7] = a.array[ 4 + (bi.array[7] & 3) ]; 2801 return r; 2802 } 2803 } 2804 unittest 2805 { 2806 __m256 A = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8); 2807 __m256 B = _mm256_permutevar_ps(A, _mm256_setr_epi32(2, 1, 0, 2, 3, 2, 1, 0)); 2808 __m256 C = _mm256_permutevar_ps(A, _mm256_setr_epi32(2, 3 + 8, 1, 0, 2, 3, 0, 1)); 2809 float[8] RB = [3.0f, 2, 1, 3, 8, 7, 6, 5]; 2810 float[8] RC = [3.0f, 4, 2, 1, 7, 8, 5, 6]; 2811 assert(B.array == RB); 2812 assert(C.array == RC); 2813 } 2814 2815 /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements 2816 /// in `a`. The maximum relative error for this approximation is less than 1.5*2^-12. 2817 __m256 _mm256_rcp_ps (__m256 a) pure @trusted 2818 { 2819 // PERF DMD 2820 static if (GDC_or_LDC_with_AVX) 2821 { 2822 return __builtin_ia32_rcpps256(a); 2823 } 2824 else 2825 { 2826 a.ptr[0] = 1.0f / a.array[0]; 2827 a.ptr[1] = 1.0f / a.array[1]; 2828 a.ptr[2] = 1.0f / a.array[2]; 2829 a.ptr[3] = 1.0f / a.array[3]; 2830 a.ptr[4] = 1.0f / a.array[4]; 2831 a.ptr[5] = 1.0f / a.array[5]; 2832 a.ptr[6] = 1.0f / a.array[6]; 2833 a.ptr[7] = 1.0f / a.array[7]; 2834 return a; 2835 } 2836 } 2837 unittest 2838 { 2839 __m256 A = _mm256_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f, 9, -46, 1869816, 55583); 2840 __m256 groundTruth = _mm256_set1_ps(1.0f) / A; 2841 __m256 result = _mm256_rcp_ps(A); 2842 foreach(i; 0..8) 2843 { 2844 double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; 2845 assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 2846 } 2847 } 2848 2849 /// Round the packed double-precision (64-bit) floating-point elements in `a` using the 2850 /// rounding parameter, and store the results as packed double-precision floating-point elements. 
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF     |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF     |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO        |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m256d _mm256_round_pd(int rounding)(__m256d a) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_roundpd256(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // PERF: on non-AVX x86, it would probably be faster to convert those doubles at once to int64

            __m128d A_lo = _mm256_extractf128_pd!0(a);
            __m128d A_hi = _mm256_extractf128_pd!1(a);

            // Convert to 64-bit integers one by one
            long x0 = _mm_cvtsd_si64(A_lo);
            long x2 = _mm_cvtsd_si64(A_hi);
            A_lo.ptr[0] = A_lo.array[1];
            A_hi.ptr[0] = A_hi.array[1];
            long x1 = _mm_cvtsd_si64(A_lo);
            long x3 = _mm_cvtsd_si64(A_hi);

            return _mm256_setr_pd(x0, x1, x2, x3);
        }
        else
        {
            version(GNU) pragma(inline, false); // this was required for SSE4.1 rounding, keep it here

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            __m128d A_lo = _mm256_extractf128_pd!0(a);
            __m128d A_hi = _mm256_extractf128_pd!1(a);

            // Convert to 64-bit integers one by one
            long x0 = _mm_cvtsd_si64(A_lo);
            long x2 = _mm_cvtsd_si64(A_hi);
            A_lo.ptr[0] = A_lo.array[1];
            A_hi.ptr[0] = A_hi.array[1];
            long x1 = _mm_cvtsd_si64(A_lo);
            long x3 = _mm_cvtsd_si64(A_hi);

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (FUTURE: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return _mm256_setr_pd(x0, x1, x2, x3);
        }
    }
}
unittest
{
    // tested in other intrinsics
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF     |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF     |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO        |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m256 _mm256_round_ps(int rounding)(__m256 a) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_roundps256(a, rounding);
    }
    else static if (GDC_or_LDC_with_SSE41)
    {
        // we can use _mm_round_ps
        __m128 lo = _mm256_extractf128_ps!0(a);
        __m128 hi = _mm256_extractf128_ps!1(a);
        __m128 ilo = _mm_round_ps!rounding(lo); // unfortunately _mm_round_ps isn't fast for arm64, so we avoid that in that case
        __m128 ihi = _mm_round_ps!rounding(hi);
        return _mm256_set_m128(ihi, ilo);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            __m256i integers = _mm256_cvtps_epi32(a);
            return _mm256_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get shuffled
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m256i integers = _mm256_cvtps_epi32(a);

            // Convert back to float to achieve the rounding
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (FUTURE: what range exactly?)
            __m256 result = _mm256_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}


/// Compute the approximate reciprocal square root of packed single-precision (32-bit)
/// floating-point elements in `a`. The maximum relative error for this approximation is less than
/// 1.5*2^-12.
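// Added example (hedged): the usual way to sharpen the ~12-bit estimate is
// one Newton-Raphson step, which brings it close to full float precision:
//
//     __m256 x = _mm256_rsqrt_ps(a); // initial estimate of 1/sqrt(a)
//     __m256 axx = _mm256_mul_ps(_mm256_mul_ps(a, x), x);
//     __m256 halfAxx = _mm256_mul_ps(axx, _mm256_set1_ps(0.5f));
//     x = _mm256_mul_ps(x, _mm256_sub_ps(_mm256_set1_ps(1.5f), halfAxx));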
2979 __m256 _mm256_rsqrt_ps (__m256 a) pure @trusted 2980 { 2981 static if (GDC_or_LDC_with_AVX) 2982 { 2983 return __builtin_ia32_rsqrtps256(a); 2984 } 2985 else version(LDC) 2986 { 2987 a[0] = 1.0f / llvm_sqrt(a[0]); 2988 a[1] = 1.0f / llvm_sqrt(a[1]); 2989 a[2] = 1.0f / llvm_sqrt(a[2]); 2990 a[3] = 1.0f / llvm_sqrt(a[3]); 2991 a[4] = 1.0f / llvm_sqrt(a[4]); 2992 a[5] = 1.0f / llvm_sqrt(a[5]); 2993 a[6] = 1.0f / llvm_sqrt(a[6]); 2994 a[7] = 1.0f / llvm_sqrt(a[7]); 2995 return a; 2996 } 2997 else 2998 { 2999 a.ptr[0] = 1.0f / sqrt(a.array[0]); 3000 a.ptr[1] = 1.0f / sqrt(a.array[1]); 3001 a.ptr[2] = 1.0f / sqrt(a.array[2]); 3002 a.ptr[3] = 1.0f / sqrt(a.array[3]); 3003 a.ptr[4] = 1.0f / sqrt(a.array[4]); 3004 a.ptr[5] = 1.0f / sqrt(a.array[5]); 3005 a.ptr[6] = 1.0f / sqrt(a.array[6]); 3006 a.ptr[7] = 1.0f / sqrt(a.array[7]); 3007 return a; 3008 } 3009 } 3010 unittest 3011 { 3012 __m256 A = _mm256_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f, 2.34f, 70000.0f, 0.00001f, 345.5f); 3013 __m256 groundTruth = _mm256_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f, 3014 0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f); 3015 __m256 result = _mm256_rsqrt_ps(A); 3016 foreach(i; 0..8) 3017 { 3018 double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; 3019 assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 3020 } 3021 } 3022 3023 /// Set packed 16-bit integers with the supplied values. 3024 __m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 3025 { 3026 short16 r; // Note: = void would prevent GDC from inlining a constant short16... 3027 r.ptr[0] = e0; 3028 r.ptr[1] = e1; 3029 r.ptr[2] = e2; 3030 r.ptr[3] = e3; 3031 r.ptr[4] = e4; 3032 r.ptr[5] = e5; 3033 r.ptr[6] = e6; 3034 r.ptr[7] = e7; 3035 r.ptr[8] = e8; 3036 r.ptr[9] = e9; 3037 r.ptr[10] = e10; 3038 r.ptr[11] = e11; 3039 r.ptr[12] = e12; 3040 r.ptr[13] = e13; 3041 r.ptr[14] = e14; 3042 r.ptr[15] = e15; 3043 return cast(__m256i) r; 3044 } 3045 unittest 3046 { 3047 short16 A = cast(short16) _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 3048 7, 6, 5, 4, 3, 2, 1, 0); 3049 foreach(i; 0..16) 3050 assert(A.array[i] == i); 3051 } 3052 3053 /// Set packed 32-bit integers with the supplied values. 3054 __m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted 3055 { 3056 // Inlines a constant with GCC -O1, LDC -O2 3057 int8 r; // = void would prevent GCC from inlining a constant call 3058 r.ptr[0] = e0; 3059 r.ptr[1] = e1; 3060 r.ptr[2] = e2; 3061 r.ptr[3] = e3; 3062 r.ptr[4] = e4; 3063 r.ptr[5] = e5; 3064 r.ptr[6] = e6; 3065 r.ptr[7] = e7; 3066 return cast(__m256i)r; 3067 } 3068 unittest 3069 { 3070 int8 A = cast(int8) _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); 3071 foreach(i; 0..8) 3072 assert(A.array[i] == i); 3073 } 3074 3075 /// Set packed 64-bit integers with the supplied values. 3076 __m256i _mm256_set_epi64x (long e3, long e2, long e1, long e0) pure @trusted 3077 { 3078 long4 r = void; 3079 r.ptr[0] = e0; 3080 r.ptr[1] = e1; 3081 r.ptr[2] = e2; 3082 r.ptr[3] = e3; 3083 return r; 3084 } 3085 unittest 3086 { 3087 __m256i A = _mm256_set_epi64x(-1, 42, long.min, long.max); 3088 long[4] correct = [long.max, long.min, 42, -1]; 3089 assert(A.array == correct); 3090 } 3091 3092 ///ditto 3093 alias _mm256_set_epi64 = _mm256_set_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API. 
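
// Added check (not from the original test-suite): the #BONUS alias behaves
// exactly like _mm256_set_epi64x, with `e3` landing in the highest lane.
unittest
{
    __m256i A = _mm256_set_epi64(3, 2, 1, 0);
    long[4] correct = [0, 1, 2, 3];
    assert(A.array == correct);
}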

/// Set packed 8-bit integers with the supplied values.
__m256i _mm256_set_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
                         byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
                         byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9, byte e8,
                         byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    // Inline a constant call in GDC -O1 and LDC -O2
    align(32) byte[32] result = [ e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
                                  e8,  e9,  e10, e11, e12, e13, e14, e15,
                                  e16, e17, e18, e19, e20, e21, e22, e23,
                                  e24, e25, e26, e27, e28, e29, e30, e31 ];
    return *cast(__m256i*)(result.ptr);
}
unittest
{
    byte32 R = cast(byte32) _mm256_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7);
    byte[32] correct = [7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0, 3, 2, 1, 0,
                        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
    assert(R.array == correct);
}

/// Set packed `__m256` vector with the supplied values.
__m256 _mm256_set_m128 (__m128 hi, __m128 lo) pure @trusted
{
    // DMD PERF
    static if (GDC_with_AVX)
    {
        __m256 r = __builtin_ia32_ps256_ps(lo);
        return __builtin_ia32_vinsertf128_ps256(r, hi, 1);
    }
    else
    {
        __m256 r = void;
        r.ptr[0] = lo.array[0];
        r.ptr[1] = lo.array[1];
        r.ptr[2] = lo.array[2];
        r.ptr[3] = lo.array[3];
        r.ptr[4] = hi.array[0];
        r.ptr[5] = hi.array[1];
        r.ptr[6] = hi.array[2];
        r.ptr[7] = hi.array[3];
        return r;
    }

    /*
        // BUG, doesn't work if AVX vector is emulated, but SSE vector is not
        // See issue #108
        __m256 r = void;
        __m128* p = cast(__m128*)(&r);
        p[0] = lo;
        p[1] = hi;
        return r;
    */
}
unittest
{
    __m128 lo = _mm_setr_ps(1.0f, 2, 3, 4);
    __m128 hi = _mm_setr_ps(3.0f, 4, 5, 6);
    __m256 R = _mm256_set_m128(hi, lo);
    float[8] correct = [1.0f, 2, 3, 4, 3, 4, 5, 6];
    assert(R.array == correct);
}

/// Set packed `__m256d` vector with the supplied values.
__m256d _mm256_set_m128d (__m128d hi, __m128d lo) pure @trusted
{
    __m256d r = void;
    r.ptr[0] = lo.array[0];
    r.ptr[1] = lo.array[1];
    r.ptr[2] = hi.array[0];
    r.ptr[3] = hi.array[1];
    return r;
}
unittest
{
    __m128d lo = _mm_setr_pd(1.0, 2.0);
    __m128d hi = _mm_setr_pd(3.0, 4.0);
    __m256d R = _mm256_set_m128d(hi, lo);
    double[4] correct = [1.0, 2.0, 3.0, 4.0];
    assert(R.array == correct);
}

/// Set packed `__m256i` vector with the supplied values.
3178 __m256i _mm256_set_m128i (__m128i hi, __m128i lo) pure @trusted 3179 { 3180 // DMD PERF 3181 static if (GDC_with_AVX) 3182 { 3183 __m256i r = cast(long4) __builtin_ia32_si256_si (lo); 3184 return cast(long4) __builtin_ia32_vinsertf128_si256(cast(int8)r, hi, 1); 3185 } 3186 else 3187 { 3188 int8 r = void; 3189 r.ptr[0] = lo.array[0]; 3190 r.ptr[1] = lo.array[1]; 3191 r.ptr[2] = lo.array[2]; 3192 r.ptr[3] = lo.array[3]; 3193 r.ptr[4] = hi.array[0]; 3194 r.ptr[5] = hi.array[1]; 3195 r.ptr[6] = hi.array[2]; 3196 r.ptr[7] = hi.array[3]; 3197 return cast(long4)r; 3198 } 3199 } 3200 unittest 3201 { 3202 __m128i lo = _mm_setr_epi32( 1, 2, 3, 4); 3203 __m128i hi = _mm_set_epi32(-3, -4, -5, -6); 3204 int8 R = cast(int8)_mm256_set_m128i(hi, lo); 3205 int[8] correct = [1, 2, 3, 4, -6, -5, -4, -3]; 3206 assert(R.array == correct); 3207 } 3208 3209 /// Set packed double-precision (64-bit) floating-point elements with the supplied values. 3210 __m256d _mm256_set_pd (double e3, double e2, double e1, double e0) pure @trusted 3211 { 3212 __m256d r = void; 3213 r.ptr[0] = e0; 3214 r.ptr[1] = e1; 3215 r.ptr[2] = e2; 3216 r.ptr[3] = e3; 3217 return r; 3218 } 3219 unittest 3220 { 3221 __m256d A = _mm256_set_pd(3, 2, 1, 546); 3222 double[4] correct = [546.0, 1.0, 2.0, 3.0]; 3223 assert(A.array == correct); 3224 } 3225 3226 /// Set packed single-precision (32-bit) floating-point elements with the supplied values. 3227 __m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted 3228 { 3229 // PERF: see #102, use = void? 3230 __m256 r; 3231 r.ptr[0] = e0; 3232 r.ptr[1] = e1; 3233 r.ptr[2] = e2; 3234 r.ptr[3] = e3; 3235 r.ptr[4] = e4; 3236 r.ptr[5] = e5; 3237 r.ptr[6] = e6; 3238 r.ptr[7] = e7; 3239 return r; 3240 } 3241 unittest 3242 { 3243 __m256 A = _mm256_set_ps(3, 2, 1, 546.0f, -1.25f, -2, -3, 0); 3244 float[8] correct = [0, -3, -2, -1.25f, 546.0f, 1.0, 2.0, 3.0]; 3245 assert(A.array == correct); 3246 } 3247 3248 /// Broadcast 16-bit integer `a` to all elements of the return value. 3249 __m256i _mm256_set1_epi16 (short a) pure @trusted 3250 { 3251 version(DigitalMars) 3252 { 3253 // workaround https://issues.dlang.org/show_bug.cgi?id=21469 3254 // It used to ICE, after that the codegen was just wrong. 3255 // No issue anymore in DMD 2.101, we can eventually remove that 3256 static if (__VERSION__ < 2101) 3257 { 3258 short16 v = a; 3259 return cast(__m256i) v; 3260 } 3261 else 3262 { 3263 pragma(inline, true); 3264 return cast(__m256i)(short16(a)); 3265 } 3266 } 3267 else 3268 { 3269 pragma(inline, true); 3270 return cast(__m256i)(short16(a)); 3271 } 3272 } 3273 unittest 3274 { 3275 short16 a = cast(short16) _mm256_set1_epi16(31); 3276 for (int i = 0; i < 16; ++i) 3277 assert(a.array[i] == 31); 3278 } 3279 3280 /// Broadcast 32-bit integer `a` to all elements. 3281 __m256i _mm256_set1_epi32 (int a) pure @trusted 3282 { 3283 version(DigitalMars) 3284 { 3285 // No issue anymore in DMD 2.101, we can eventually remove that 3286 static if (__VERSION__ < 2101) 3287 { 3288 int8 v = a; 3289 return cast(__m256i) v; 3290 } 3291 else 3292 { 3293 pragma(inline, true); 3294 return cast(__m256i)(int8(a)); 3295 } 3296 } 3297 else 3298 { 3299 pragma(inline, true); 3300 return cast(__m256i)(int8(a)); 3301 } 3302 } 3303 unittest 3304 { 3305 int8 a = cast(int8) _mm256_set1_epi32(31); 3306 for (int i = 0; i < 8; ++i) 3307 assert(a.array[i] == 31); 3308 } 3309 3310 /// Broadcast 64-bit integer `a` to all elements of the return value. 
3311 __m256i _mm256_set1_epi64x (long a) pure 3312 { 3313 return cast(__m256i)(long4(a)); 3314 } 3315 unittest 3316 { 3317 long4 a = cast(long4) _mm256_set1_epi64x(-31); 3318 for (int i = 0; i < 4; ++i) 3319 assert(a.array[i] == -31); 3320 } 3321 ///ditto 3322 alias _mm256_set1_epi64 = _mm256_set1_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API. 3323 3324 /// Broadcast 8-bit integer `a` to all elements of the return value. 3325 __m256i _mm256_set1_epi8 (byte a) pure @trusted 3326 { 3327 version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 3328 { 3329 byte32 v = a; 3330 return cast(__m256i) v; 3331 } 3332 else 3333 { 3334 pragma(inline, true); 3335 return cast(__m256i)(byte32(a)); 3336 } 3337 } 3338 unittest 3339 { 3340 byte32 a = cast(byte32) _mm256_set1_epi8(31); 3341 for (int i = 0; i < 32; ++i) 3342 assert(a.array[i] == 31); 3343 } 3344 3345 /// Broadcast double-precision (64-bit) floating-point value `a` to all elements of the return value. 3346 __m256d _mm256_set1_pd (double a) pure @trusted 3347 { 3348 return __m256d(a); 3349 } 3350 unittest 3351 { 3352 double a = 464.21; 3353 double[4] correct = [a, a, a, a]; 3354 double4 A = cast(double4) _mm256_set1_pd(a); 3355 assert(A.array == correct); 3356 } 3357 3358 /// Broadcast single-precision (32-bit) floating-point value `a` to all elements of the return value. 3359 __m256 _mm256_set1_ps (float a) pure @trusted 3360 { 3361 return __m256(a); 3362 } 3363 unittest 3364 { 3365 float a = 464.21f; 3366 float[8] correct = [a, a, a, a, a, a, a, a]; 3367 float8 A = cast(float8) _mm256_set1_ps(a); 3368 assert(A.array == correct); 3369 } 3370 3371 /// Set packed 16-bit integers with the supplied values in reverse order. 3372 __m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, 3373 short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 3374 { 3375 short[16] result = [ e15, e14, e13, e12, e11, e10, e9, e8, 3376 e7, e6, e5, e4, e3, e2, e1, e0]; 3377 static if (GDC_with_AVX) 3378 { 3379 return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) result.ptr); 3380 } 3381 else static if (LDC_with_optimizations) 3382 { 3383 return cast(__m256i)( loadUnaligned!(short16)(result.ptr) ); 3384 } 3385 else 3386 { 3387 short16 r; 3388 for(int n = 0; n < 16; ++n) 3389 r.ptr[n] = result[n]; 3390 return cast(__m256i)r; 3391 } 3392 } 3393 unittest 3394 { 3395 short16 A = cast(short16) _mm256_setr_epi16(-1, 0, -21, 21, 42, 127, -42, -128, 3396 -1, 0, -21, 21, 42, 127, -42, -128); 3397 short[16] correct = [-1, 0, -21, 21, 42, 127, -42, -128, 3398 -1, 0, -21, 21, 42, 127, -42, -128]; 3399 assert(A.array == correct); 3400 } 3401 3402 /// Set packed 32-bit integers with the supplied values in reverse order. 
3403 __m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted 3404 { 3405 // Inlines a constant with GCC -O1, LDC -O2 3406 int8 r; // = void would prevent GDC from inlining a constant call 3407 r.ptr[0] = e7; 3408 r.ptr[1] = e6; 3409 r.ptr[2] = e5; 3410 r.ptr[3] = e4; 3411 r.ptr[4] = e3; 3412 r.ptr[5] = e2; 3413 r.ptr[6] = e1; 3414 r.ptr[7] = e0; 3415 return cast(__m256i)r; 3416 } 3417 unittest 3418 { 3419 int8 A = cast(int8) _mm256_setr_epi32(-1, 0, -2147483648, 2147483647, 42, 666, -42, -666); 3420 int[8] correct = [-1, 0, -2147483648, 2147483647, 42, 666, -42, -666]; 3421 assert(A.array == correct); 3422 } 3423 3424 /// Set packed 64-bit integers with the supplied values in reverse order. 3425 __m256i _mm256_setr_epi64x (long e3, long e2, long e1, long e0) pure @trusted 3426 { 3427 long4 r = void; 3428 r.ptr[0] = e3; 3429 r.ptr[1] = e2; 3430 r.ptr[2] = e1; 3431 r.ptr[3] = e0; 3432 return r; 3433 } 3434 unittest 3435 { 3436 __m256i A = _mm256_setr_epi64x(-1, 42, long.min, long.max); 3437 long[4] correct = [-1, 42, long.min, long.max]; 3438 assert(A.array == correct); 3439 } 3440 ///ditto 3441 alias _mm256_setr_epi64 = _mm256_setr_epi64x; // #BONUS, not sure why this isn't in Intel Intrinsics API. 3442 3443 /// Set packed 8-bit integers with the supplied values in reverse order. 3444 __m256i _mm256_setr_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24, 3445 byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16, 3446 byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9, byte e8, 3447 byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) pure @trusted 3448 { 3449 // Inline a constant call in GDC -O1 and LDC -O2 3450 align(32) byte[32] result = [ e31, e30, e29, e28, e27, e26, e25, e24, 3451 e23, e22, e21, e20, e19, e18, e17, e16, 3452 e15, e14, e13, e12, e11, e10, e9, e8, 3453 e7, e6, e5, e4, e3, e2, e1, e0]; 3454 return *cast(__m256i*)(result.ptr); 3455 } 3456 unittest 3457 { 3458 byte32 A = cast(byte32) _mm256_setr_epi8( -1, 0, -21, 21, 42, 127, -42, -128, 3459 -1, 0, -21, 21, 42, 127, -42, -128, 3460 -1, 0, -21, 21, 42, 127, -42, -128, 3461 -1, 0, -21, 21, 42, 127, -42, -128); 3462 byte[32] correct = [-1, 0, -21, 21, 42, 127, -42, -128, 3463 -1, 0, -21, 21, 42, 127, -42, -128, 3464 -1, 0, -21, 21, 42, 127, -42, -128, 3465 -1, 0, -21, 21, 42, 127, -42, -128]; 3466 assert(A.array == correct); 3467 } 3468 3469 /// Set packed `__m256` vector with the supplied values. 3470 __m256 _mm256_setr_m128 (__m128 lo, __m128 hi) pure 3471 { 3472 return _mm256_set_m128(hi, lo); 3473 } 3474 unittest 3475 { 3476 __m128 A = _mm_setr_ps(1.0f, 2, 3, 4); 3477 __m128 B = _mm_setr_ps(3.0f, 4, 5, 6); 3478 __m256 R = _mm256_setr_m128(B, A); 3479 float[8] correct = [3.0f, 4, 5, 6, 1, 2, 3, 4,]; 3480 assert(R.array == correct); 3481 } 3482 3483 /// Set packed `__m256d` vector with the supplied values. 3484 __m256d _mm256_setr_m128d (__m128d lo, __m128d hi) pure 3485 { 3486 return _mm256_set_m128d(hi, lo); 3487 } 3488 unittest 3489 { 3490 __m128d A = _mm_setr_pd(1.0, 2.0); 3491 __m128d B = _mm_setr_pd(3.0, 4.0); 3492 __m256d R = _mm256_setr_m128d(B, A); 3493 double[4] correct = [3.0, 4.0, 1.0, 2.0]; 3494 assert(R.array == correct); 3495 } 3496 3497 /// Set packed `__m256i` vector with the supplied values. 
3498 __m256i _mm256_setr_m128i (__m128i lo, __m128i hi) pure 3499 { 3500 return _mm256_set_m128i(hi, lo); 3501 } 3502 unittest 3503 { 3504 __m128i A = _mm_setr_epi32( 1, 2, 3, 4); 3505 __m128i B = _mm_set_epi32(-3, -4, -5, -6); 3506 int8 R = cast(int8)_mm256_setr_m128i(B, A); 3507 int[8] correct = [-6, -5, -4, -3, 1, 2, 3, 4]; 3508 assert(R.array == correct); 3509 } 3510 3511 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order. 3512 __m256d _mm256_setr_pd (double e3, double e2, double e1, double e0) pure @trusted 3513 { 3514 static if (LDC_with_optimizations) 3515 { 3516 // PERF, probably not the best 3517 double[4] result = [e3, e2, e1, e0]; 3518 return loadUnaligned!(double4)(result.ptr); 3519 } 3520 else 3521 { 3522 __m256d r; 3523 r.ptr[0] = e3; 3524 r.ptr[1] = e2; 3525 r.ptr[2] = e1; 3526 r.ptr[3] = e0; 3527 return r; 3528 } 3529 } 3530 unittest 3531 { 3532 __m256d A = _mm256_setr_pd(3, 2, 1, 546.125); 3533 double[4] correct = [3.0, 2.0, 1.0, 546.125]; 3534 assert(A.array == correct); 3535 } 3536 3537 3538 /// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order. 3539 __m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) pure @trusted 3540 { 3541 // PERF DMD 3542 static if (GDC_with_AVX) 3543 { 3544 align(32) float[8] r = [ e7, e6, e5, e4, e3, e2, e1, e0]; 3545 return *cast(__m256*)r; 3546 } 3547 else version(LDC) 3548 { 3549 align(32) float[8] r = [ e7, e6, e5, e4, e3, e2, e1, e0]; 3550 return *cast(__m256*)r; 3551 } 3552 else 3553 { 3554 __m256 r; 3555 r.ptr[0] = e7; 3556 r.ptr[1] = e6; 3557 r.ptr[2] = e5; 3558 r.ptr[3] = e4; 3559 r.ptr[4] = e3; 3560 r.ptr[5] = e2; 3561 r.ptr[6] = e1; 3562 r.ptr[7] = e0; 3563 return r; 3564 } 3565 } 3566 unittest 3567 { 3568 __m256 A = _mm256_setr_ps( 3, 2, 1, 546.125f, 4, 5, 6, 7); 3569 float[8] correct = [3.0f, 2, 1, 546.125f, 4, 5, 6, 7]; 3570 assert(A.array == correct); 3571 } 3572 3573 /// Return vector of type `__m256d` with all elements set to zero. 3574 __m256d _mm256_setzero_pd() pure @safe 3575 { 3576 return double4(0.0); 3577 } 3578 unittest 3579 { 3580 __m256d A = _mm256_setzero_pd(); 3581 double[4] correct = [0.0, 0.0, 0.0, 0.0]; 3582 assert(A.array == correct); 3583 } 3584 3585 /// Return vector of type `__m256` with all elements set to zero. 3586 __m256 _mm256_setzero_ps() pure @safe 3587 { 3588 return float8(0.0f); 3589 } 3590 unittest 3591 { 3592 __m256 A = _mm256_setzero_ps(); 3593 float[8] correct = [0.0f, 0, 0, 0, 0, 0, 0, 0]; 3594 assert(A.array == correct); 3595 } 3596 3597 /// Return vector of type `__m256i` with all elements set to zero. 3598 __m256i _mm256_setzero_si256() pure @trusted 3599 { 3600 return __m256i(0); 3601 } 3602 unittest 3603 { 3604 __m256i A = _mm256_setzero_si256(); 3605 long[4] correct = [0, 0, 0, 0]; 3606 assert(A.array == correct); 3607 } 3608 3609 /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the 3610 /// control in `imm8`. 
3611 __m256d _mm256_shuffle_pd(int imm8)(__m256d a, __m256d b) pure @trusted 3612 { 3613 // PERF DMD 3614 static if (GDC_with_AVX) 3615 { 3616 return __builtin_ia32_shufpd256(a, b, imm8); 3617 } 3618 else version(LDC) 3619 { 3620 return shufflevectorLDC!(double4, 3621 (imm8 >> 0) & 1, 3622 4 + ( (imm8 >> 1) & 1), 3623 2 + ( (imm8 >> 2) & 1), 3624 6 + ( (imm8 >> 3) & 1) )(a, b); 3625 } 3626 else 3627 { 3628 double4 r = void; 3629 r.ptr[0] = a.array[(imm8 >> 0) & 1]; 3630 r.ptr[1] = b.array[(imm8 >> 1) & 1]; 3631 r.ptr[2] = a.array[2 + ( (imm8 >> 2) & 1)]; 3632 r.ptr[3] = b.array[2 + ( (imm8 >> 3) & 1)]; 3633 return r; 3634 } 3635 } 3636 unittest 3637 { 3638 __m256d A = _mm256_setr_pd( 0, 1, 2, 3); 3639 __m256d B = _mm256_setr_pd( 4, 5, 6, 7); 3640 __m256d C = _mm256_shuffle_pd!75 /* 01001011 */(A, B); 3641 double[4] correct = [1.0, 5.0, 2.0, 7.0]; 3642 assert(C.array == correct); 3643 } 3644 3645 /// Shuffle single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using 3646 /// the control in `imm8`. 3647 __m256 _mm256_shuffle_ps(int imm8)(__m256 a, __m256 b) pure @trusted 3648 { 3649 // PERF DMD 3650 static if (GDC_with_AVX) 3651 { 3652 return __builtin_ia32_shufps256(a, b, imm8); 3653 } 3654 else version(LDC) 3655 { 3656 return shufflevectorLDC!(float8, (imm8 >> 0) & 3, 3657 (imm8 >> 2) & 3, 3658 8 + ( (imm8 >> 4) & 3), 3659 8 + ( (imm8 >> 6) & 3), 3660 4 + ( (imm8 >> 0) & 3), 3661 4 + ( (imm8 >> 2) & 3), 3662 12 + ( (imm8 >> 4) & 3), 3663 12 + ( (imm8 >> 6) & 3) )(a, b); 3664 } 3665 else 3666 { 3667 float8 r = void; 3668 r.ptr[0] = a.array[(imm8 >> 0) & 3]; 3669 r.ptr[1] = a.array[(imm8 >> 2) & 3]; 3670 r.ptr[2] = b.array[(imm8 >> 4) & 3]; 3671 r.ptr[3] = b.array[(imm8 >> 6) & 3]; 3672 r.ptr[4] = a.array[4 + ( (imm8 >> 0) & 3 )]; 3673 r.ptr[5] = a.array[4 + ( (imm8 >> 2) & 3 )]; 3674 r.ptr[6] = b.array[4 + ( (imm8 >> 4) & 3 )]; 3675 r.ptr[7] = b.array[4 + ( (imm8 >> 6) & 3 )]; 3676 return r; 3677 } 3678 } 3679 unittest 3680 { 3681 __m256 A = _mm256_setr_ps( 0, 1, 2, 3, 4, 5, 6, 7); 3682 __m256 B = _mm256_setr_ps( 8, 9, 10, 11, 12, 13, 14, 15); 3683 __m256 C = _mm256_shuffle_ps!75 /* 01001011 */(A, B); 3684 float[8] correct = [3.0f, 2, 8, 9, 7, 6, 12, 13]; 3685 assert(C.array == correct); 3686 } 3687 3688 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `a`. 3689 __m256d _mm256_sqrt_pd (__m256d a) pure @trusted 3690 { 3691 static if (GDC_with_AVX) 3692 { 3693 return __builtin_ia32_sqrtpd256(a); 3694 } 3695 else version(LDC) 3696 { 3697 static if (__VERSION__ >= 2084) 3698 return llvm_sqrt(a); // that capability appeared in LDC 1.14 3699 else 3700 { 3701 a.ptr[0] = llvm_sqrt(a.array[0]); 3702 a.ptr[1] = llvm_sqrt(a.array[1]); 3703 a.ptr[2] = llvm_sqrt(a.array[2]); 3704 a.ptr[3] = llvm_sqrt(a.array[3]); 3705 return a; 3706 } 3707 } 3708 else 3709 { 3710 a.ptr[0] = sqrt(a.array[0]); 3711 a.ptr[1] = sqrt(a.array[1]); 3712 a.ptr[2] = sqrt(a.array[2]); 3713 a.ptr[3] = sqrt(a.array[3]); 3714 return a; 3715 } 3716 } 3717 unittest 3718 { 3719 __m256d A = _mm256_sqrt_pd(_mm256_set1_pd(4.0)); 3720 double[4] correct = [2.0, 2, 2, 2]; 3721 assert(A.array == correct); 3722 } 3723 3724 /// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`. 
/// Compute the square root of packed double-precision (64-bit) floating-point elements in `a`.
__m256d _mm256_sqrt_pd (__m256d a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_sqrtpd256(a);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2084)
            return llvm_sqrt(a); // that capability appeared in LDC 1.14
        else
        {
            a.ptr[0] = llvm_sqrt(a.array[0]);
            a.ptr[1] = llvm_sqrt(a.array[1]);
            a.ptr[2] = llvm_sqrt(a.array[2]);
            a.ptr[3] = llvm_sqrt(a.array[3]);
            return a;
        }
    }
    else
    {
        a.ptr[0] = sqrt(a.array[0]);
        a.ptr[1] = sqrt(a.array[1]);
        a.ptr[2] = sqrt(a.array[2]);
        a.ptr[3] = sqrt(a.array[3]);
        return a;
    }
}
unittest
{
    __m256d A = _mm256_sqrt_pd(_mm256_set1_pd(4.0));
    double[4] correct = [2.0, 2, 2, 2];
    assert(A.array == correct);
}

/// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
__m256 _mm256_sqrt_ps (__m256 a) pure @trusted
{
    static if (GDC_with_AVX)
    {
        return __builtin_ia32_sqrtps256(a);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2084)
            return llvm_sqrt(a); // that capability appeared in LDC 1.14
        else
        {
            a.ptr[0] = llvm_sqrt(a.array[0]);
            a.ptr[1] = llvm_sqrt(a.array[1]);
            a.ptr[2] = llvm_sqrt(a.array[2]);
            a.ptr[3] = llvm_sqrt(a.array[3]);
            a.ptr[4] = llvm_sqrt(a.array[4]);
            a.ptr[5] = llvm_sqrt(a.array[5]);
            a.ptr[6] = llvm_sqrt(a.array[6]);
            a.ptr[7] = llvm_sqrt(a.array[7]);
            return a;
        }
    }
    else
    {
        a.ptr[0] = sqrt(a.array[0]);
        a.ptr[1] = sqrt(a.array[1]);
        a.ptr[2] = sqrt(a.array[2]);
        a.ptr[3] = sqrt(a.array[3]);
        a.ptr[4] = sqrt(a.array[4]);
        a.ptr[5] = sqrt(a.array[5]);
        a.ptr[6] = sqrt(a.array[6]);
        a.ptr[7] = sqrt(a.array[7]);
        return a;
    }
}
unittest
{
    __m256 A = _mm256_sqrt_ps(_mm256_set1_ps(4.0f));
    float[8] correct = [2.0f, 2, 2, 2, 2, 2, 2, 2];
    assert(A.array == correct);
}

/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
/// exception may be generated.
void _mm256_store_pd (double* mem_addr, __m256d a) pure @system
{
    *cast(__m256d*)mem_addr = a;
}
unittest
{
    align(32) double[4] mem;
    double[4] correct = [1.0, 2, 3, 4];
    _mm256_store_pd(mem.ptr, _mm256_setr_pd(1.0, 2, 3, 4));
    assert(mem == correct);
}

/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
/// `a` into memory. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
/// exception may be generated.
void _mm256_store_ps (float* mem_addr, __m256 a) pure @system
{
    *cast(__m256*)mem_addr = a;
}
unittest
{
    align(32) float[8] mem;
    float[8] correct = [1.0, 2, 3, 4, 5, 6, 7, 8];
    _mm256_store_ps(mem.ptr, _mm256_set_ps(8.0, 7, 6, 5, 4, 3, 2, 1));
    assert(mem == correct);
}

/// Store 256-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 32-byte
/// boundary or a general-protection exception may be generated.
void _mm256_store_si256 (__m256i * mem_addr, __m256i a) pure @safe
{
    *mem_addr = a;
}
unittest
{
    align(32) long[4] mem;
    long[4] correct = [5, -6, -7, 8];
    _mm256_store_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
    assert(mem == correct);
}

/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm256_storeu_pd (double * mem_addr, __m256d a) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        __builtin_ia32_storeupd256(mem_addr, a);
    }
    else static if (LDC_with_optimizations)
    {
        storeUnaligned!__m256d(a, mem_addr);
    }
    else
    {
        for (int n = 0; n < 4; ++n)
            mem_addr[n] = a.array[n];
    }
}
unittest
{
    align(32) double[6] arr = [0.0, 0, 0, 0, 0, 0];
    _mm256_storeu_pd(&arr[1], _mm256_set1_pd(4.0));
    double[4] correct = [4.0, 4, 4, 4];
    assert(arr[1..5] == correct);
}

/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm256_storeu_ps (float* mem_addr, __m256 a) pure @system
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        __builtin_ia32_storeups256(mem_addr, a);
    }
    else static if (LDC_with_optimizations)
    {
        storeUnaligned!__m256(a, mem_addr);
    }
    else
    {
        for (int n = 0; n < 8; ++n)
            mem_addr[n] = a.array[n];
    }
}
unittest
{
    align(32) float[10] arr = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    _mm256_storeu_ps(&arr[1], _mm256_set1_ps(4.0f));
    float[8] correct = [4.0f, 4, 4, 4, 4, 4, 4, 4];
    assert(arr[1..9] == correct);
}

/// Store 256-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned
/// on any particular boundary.
void _mm256_storeu_si256 (__m256i* mem_addr, __m256i a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_AVX)
    {
        __builtin_ia32_storedqu256(cast(char*)mem_addr, cast(ubyte32) a);
    }
    else static if (LDC_with_optimizations)
    {
        storeUnaligned!__m256i(a, cast(long*)mem_addr);
    }
    else
    {
        long4 v = cast(long4)a;
        long* p = cast(long*)mem_addr;
        for (int n = 0; n < 4; ++n)
            p[n] = v.array[n];
    }
}
unittest
{
    align(32) long[6] arr = [0, 0, 0, 0, 0, 0];
    _mm256_storeu_si256( cast(__m256i*) &arr[1], _mm256_set1_epi64x(4));
    long[4] correct = [4, 4, 4, 4];
    assert(arr[1..5] == correct);
}

/// Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a` into two different 128-bit memory locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a) pure @system
{
    // Scalar stores beat the other intrinsic combinations on both GDC and LDC here.
    loaddr[0] = a.array[0];
    loaddr[1] = a.array[1];
    loaddr[2] = a.array[2];
    loaddr[3] = a.array[3];
    hiaddr[0] = a.array[4];
    hiaddr[1] = a.array[5];
    hiaddr[2] = a.array[6];
    hiaddr[3] = a.array[7];
}
unittest
{
    align(32) float[11] A = [0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    _mm256_storeu2_m128(&A[1], &A[6], _mm256_set1_ps(2.0f));
    float[11] correct = [0.0f, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0];
    assert(A == correct);
}

/// Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a` into two different 128-bit memory locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a) pure @system
{
    loaddr[0] = a.array[0];
    loaddr[1] = a.array[1];
    hiaddr[0] = a.array[2];
    hiaddr[1] = a.array[3];
}
unittest
{
    double[2] A;
    double[2] B;
    _mm256_storeu2_m128d(A.ptr, B.ptr, _mm256_set1_pd(-43.0));
    double[2] correct = [-43.0, -43];
    assert(A == correct);
    assert(B == correct);
}

/// Store the high and low 128-bit halves (each composed of integer data) from `a` into two
/// different 128-bit memory locations.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a) pure @trusted
{
    long* hi = cast(long*)hiaddr;
    long* lo = cast(long*)loaddr;
    lo[0] = a.array[0];
    lo[1] = a.array[1];
    hi[0] = a.array[2];
    hi[1] = a.array[3];
}
unittest
{
    long[2] A;
    long[2] B;
    _mm256_storeu2_m128i(cast(__m128i*)A.ptr, cast(__m128i*)B.ptr, _mm256_set1_epi64x(-42));
    long[2] correct = [-42, -42];
    assert(A == correct);
    assert(B == correct);
}

/// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_pd (double* mem_addr, __m256d a) pure @system
{
    // PERF DMD
    // PERF GDC + SSE2
    static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x double> %1, <4 x double>* %0, align 32, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, double4*, double4)(cast(double4*)mem_addr, a);
    }
    else static if (GDC_with_AVX) // VMOVNTPD, genuinely non-temporal
    {
        __builtin_ia32_movntpd256 (mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m256d* dest = cast(__m256d*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(32) double[4] mem;
    double[4] correct = [5.0, -6, -7, 8];
    _mm256_stream_pd(mem.ptr, _mm256_setr_pd(5.0, -6, -7, 8));
    assert(mem == correct);
}

/// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from
/// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 32-byte
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_ps (float* mem_addr, __m256 a) pure @system
{
    // PERF DMD
    // PERF GDC + SSE2
    static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <8 x float> %1, <8 x float>* %0, align 32, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, float8*, float8)(cast(float8*)mem_addr, a);
    }
    else static if (GDC_with_AVX)
    {
        __builtin_ia32_movntps256 (mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m256* dest = cast(__m256*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(32) float[8] mem;
    float[8] correct = [5, -6, -7, 8, 1, 2, 3, 4];
    _mm256_stream_ps(mem.ptr, _mm256_setr_ps(5, -6, -7, 8, 1, 2, 3, 4));
    assert(mem == correct);
}

/// Store 256-bits of integer data from `a` into memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be
/// generated.
/// Note: there is no dedicated AVX codepath here; this just defers to SSE2 stores.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm256_stream_si256 (__m256i * mem_addr, __m256i a) pure @trusted
{
    // PERF DMD
    // PERF GDC
    static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x i64> %1, <4 x i64>* %0, align 16, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, long4*, long4)(mem_addr, a);
    }
    else static if (GDC_with_SSE2) // any hope to be non-temporal? Using SSE2 instructions.
    {
        long2 lo, hi;
        lo.ptr[0] = a.array[0];
        lo.ptr[1] = a.array[1];
        hi.ptr[0] = a.array[2];
        hi.ptr[1] = a.array[3];
        _mm_stream_si128(cast(__m128i*)mem_addr, cast(__m128i)lo);
        _mm_stream_si128((cast(__m128i*)mem_addr) + 1, cast(__m128i)hi);
    }
    else
    {
        // Regular store instead.
        __m256i* dest = cast(__m256i*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(32) long[4] mem;
    long[4] correct = [5, -6, -7, 8];
    _mm256_stream_si256(cast(__m256i*)(mem.ptr), _mm256_setr_epi64x(5, -6, -7, 8));
    assert(mem == correct);
}
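// A minimal sketch of the intended publication pattern, assuming another thread
// will read the destination: stream the data out, then fence before signaling readers,
// as the notes above recommend.
unittest
{
    align(32) long[4] outbuf;
    _mm256_stream_si256(cast(__m256i*) outbuf.ptr, _mm256_set1_epi64x(42));
    _mm_sfence(); // make the non-temporal stores visible before signaling readers
    assert(outbuf[0] == 42);
}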
/// Subtract packed double-precision (64-bit) floating-point elements in `b` from
/// packed double-precision (64-bit) floating-point elements in `a`.
__m256d _mm256_sub_pd (__m256d a, __m256d b) pure @safe
{
    return a - b;
}
unittest
{
    __m256d a = [1.5, -2.0, 3.0, 200000.0];
    a = _mm256_sub_pd(a, a);
    double[4] correct = [0.0, 0, 0, 0];
    assert(a.array == correct);
}

/// Subtract packed single-precision (32-bit) floating-point elements in `b` from
/// packed single-precision (32-bit) floating-point elements in `a`.
__m256 _mm256_sub_ps (__m256 a, __m256 b) pure @safe
{
    return a - b;
}
unittest
{
    __m256 a = [1.5f, -2.0f, 3.0f, 1.0f, 1.5f, -2000.0f, 3.0f, 1.0f];
    a = _mm256_sub_ps(a, a);
    float[8] correct = [0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == correct);
}

/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and
/// return 1 if the sign bit of each 64-bit element in the intermediate value is zero,
/// otherwise return 0.
int _mm_testc_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestcpd(a, b);
    }
    else
    {
        // PERF: maybe do the generic version more like simde
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 r = ~la & lb;
        return r.array[0] >= 0 && r.array[1] >= 0;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(-1,  1);
    __m128d B = _mm_setr_pd(-1, -1);
    __m128d C = _mm_setr_pd( 1, -1);
    assert(_mm_testc_pd(A, A) == 1);
    assert(_mm_testc_pd(A, B) == 0);
    assert(_mm_testc_pd(B, A) == 1);
}

///ditto
int _mm256_testc_pd (__m256d a, __m256d b) pure @safe
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestcpd256(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // better to split than do vanilla (down to 10 inst)
        __m128d lo_a = _mm256_extractf128_pd!0(a);
        __m128d lo_b = _mm256_extractf128_pd!0(b);
        __m128d hi_a = _mm256_extractf128_pd!1(a);
        __m128d hi_b = _mm256_extractf128_pd!1(b);
        return _mm_testc_pd(lo_a, lo_b) & _mm_testc_pd(hi_a, hi_b);
    }
    else
    {
        // PERF: doing the generic version more like simde might get rid of the arm64 version
        long4 la = cast(long4)a;
        long4 lb = cast(long4)b;
        long4 r = ~la & lb;
        return r.array[0] >= 0 && r.array[1] >= 0 && r.array[2] >= 0 && r.array[3] >= 0;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(-1,  1, -1,  1);
    __m256d B = _mm256_setr_pd(-1, -1, -1, -1);
    __m256d C = _mm256_setr_pd( 1, -1,  1, -1);
    assert(_mm256_testc_pd(A, A) == 1);
    assert(_mm256_testc_pd(A, B) == 0);
    assert(_mm256_testc_pd(B, A) == 1);
}

/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and
/// return 1 if the sign bit of each 32-bit element in the intermediate value is zero,
/// otherwise return 0.
int _mm_testc_ps (__m128 a, __m128 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestcps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        int4 la = cast(int4)a;
        int4 lb = cast(int4)b;
        int4 r = ~la & lb;
        int4 shift;
        shift = 31;
        r >>= shift;
        int[4] zero = [0, 0, 0, 0];
        return r.array == zero;
    }
    else
    {
        // PERF: doing the generic version more like simde might get rid of the arm64 version
        int4 la = cast(int4)a;
        int4 lb = cast(int4)b;
        int4 r = ~la & lb;
        return r.array[0] >= 0 && r.array[1] >= 0 && r.array[2] >= 0 && r.array[3] >= 0;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(-1,  1, -1,  1);
    __m128 B = _mm_setr_ps(-1, -1, -1, -1);
    __m128 C = _mm_setr_ps( 1, -1,  1, -1);
    assert(_mm_testc_ps(A, A) == 1);
    assert(_mm_testc_ps(A, B) == 0);
    assert(_mm_testc_ps(B, A) == 1);
}

///ditto
int _mm256_testc_ps (__m256 a, __m256 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestcps256(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        int8 la = cast(int8)a;
        int8 lb = cast(int8)b;
        int8 r = ~la & lb;
        int8 shift;
        shift = 31;
        r >>= shift;
        int[8] zero = [0, 0, 0, 0, 0, 0, 0, 0];
        return r.array == zero;
    }
    else
    {
        // PERF: doing the generic version more like simde might get rid of the arm64 version
        int8 la = cast(int8)a;
        int8 lb = cast(int8)b;
        int8 r = ~la & lb;
        return r.array[0] >= 0
            && r.array[1] >= 0
            && r.array[2] >= 0
            && r.array[3] >= 0
            && r.array[4] >= 0
            && r.array[5] >= 0
            && r.array[6] >= 0
            && r.array[7] >= 0;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(-1,  1, -1,  1, -1,  1, -1,  1);
    __m256 B = _mm256_setr_ps(-1, -1, -1, -1, -1, -1, -1, -1);
    __m256 C = _mm256_setr_ps( 1, -1,  1, -1,  1,  1,  1,  1);
    assert(_mm256_testc_ps(A, A) == 1);
    assert(_mm256_testc_ps(B, B) == 1);
    assert(_mm256_testc_ps(A, B) == 0);
    assert(_mm256_testc_ps(B, A) == 1);
    assert(_mm256_testc_ps(C, B) == 0);
    assert(_mm256_testc_ps(B, C) == 1);
}

/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the result is zero,
/// otherwise return 0.
/// In other words, test if all bits masked by `b` are also 1 in `a`.
int _mm256_testc_si256 (__m256i a, __m256i b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_ptestc256(cast(long4)a, cast(long4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // better to split than do vanilla (down to 10 inst)
        __m128i lo_a = _mm256_extractf128_si256!0(a);
        __m128i lo_b = _mm256_extractf128_si256!0(b);
        __m128i hi_a = _mm256_extractf128_si256!1(a);
        __m128i hi_b = _mm256_extractf128_si256!1(b);
        return _mm_testc_si128(lo_a, lo_b) & _mm_testc_si128(hi_a, hi_b);
    }
    else
    {
        __m256i c = ~a & b;
        long[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m256i A  = _mm256_setr_epi64(0x01, 0x02, 0x04, 0xf8);
    __m256i M1 = _mm256_setr_epi64(0xfe, 0xfd, 0x00, 0x00);
    __m256i M2 = _mm256_setr_epi64(0x00, 0x00, 0x04, 0x00);
    assert(_mm256_testc_si256(A, A) == 1);
    assert(_mm256_testc_si256(A, M1) == 0);
    assert(_mm256_testc_si256(A, M2) == 1);
}

/// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 128-bit value, and set ZF to 1 if the
/// sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that corresponds to a positive
/// number in `a`, AND there is at least one negative number in `b` that corresponds to a negative
/// number in `a`.
int _mm_testnzc_pd (__m128d a, __m128d b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcpd(a, b);
    }
    else
    {
        // ZF = 0 means "there is at least one pair of negative numbers"
        // ZF = 1 means "no pairs of negative numbers"
        // CF = 0 means "there is a negative number in b that is next to a positive number in a"
        // CF = 1 means "all negative numbers in b are also negative in a"
        // Consequently, CF = 0 and ZF = 0 means:
        // "There is a pair of matching negative numbers in a and b,
        //  AND also there is a negative number in b, that is matching a positive number in a"
        // Phew.

        // courtesy of simd-everywhere
        __m128i m = _mm_and_si128(cast(__m128i)a, cast(__m128i)b);
        __m128i m2 = _mm_andnot_si128(cast(__m128i)a, cast(__m128i)b);
        m = _mm_srli_epi64(m, 63);
        m2 = _mm_srli_epi64(m2, 63);
        return cast(int)( m.array[0] | m.array[2]) & (m2.array[0] | m2.array[2]);
    }
}
unittest
{
    __m128d PM = _mm_setr_pd( 1, -1);
    __m128d MP = _mm_setr_pd(-1,  1);
    __m128d MM = _mm_setr_pd(-1, -1);
    assert(_mm_testnzc_pd(PM, MP) == 0);
    assert(_mm_testnzc_pd(PM, MM) == 1);
    assert(_mm_testnzc_pd(MP, MP) == 0);
    assert(_mm_testnzc_pd(MP, MM) == 1);
    assert(_mm_testnzc_pd(MM, MM) == 0);
}
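// To make the ZF/CF reasoning above concrete, here is a scalar model of the
// VTESTPD-style predicate on two lanes (an illustration only, not part of the
// public API; only the sign bits of the lanes matter):
private int testnzcModel2(long2 a, long2 b) pure @safe
{
    // ZF = 1 when (a AND b) has no sign bit set in any lane.
    bool zf = ((a.array[0] & b.array[0]) | (a.array[1] & b.array[1])) >= 0;
    // CF = 1 when (NOT a AND b) has no sign bit set in any lane.
    bool cf = ((~a.array[0] & b.array[0]) | (~a.array[1] & b.array[1])) >= 0;
    // The testnzc predicate returns 1 exactly when both flags are zero.
    return (!zf && !cf) ? 1 : 0;
}
unittest
{
    long2 PM, MM;
    PM.ptr[0] =  1; PM.ptr[1] = -1;
    MM.ptr[0] = -1; MM.ptr[1] = -1;
    assert(testnzcModel2(PM, MM) == 1); // matches _mm_testnzc_pd(PM, MM)
    assert(testnzcModel2(MM, MM) == 0); // matches _mm_testnzc_pd(MM, MM)
}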
/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 256-bit value, and set ZF to 1 if the
/// sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that corresponds to a positive
/// number in `a`, AND there is at least one negative number in `b` that corresponds to a negative
/// number in `a`.
int _mm256_testnzc_pd (__m256d a, __m256d b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcpd256(a, b);
    }
    else
    {
        long4 la = cast(long4)a;
        long4 lb = cast(long4)b;
        long4 r = la & lb;
        long m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
        int ZF = (~m >> 63) & 1;
        long4 r2 = ~la & lb;
        long m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3];
        int CF = (~m2 >> 63) & 1;
        return (CF | ZF) == 0;
    }
}
unittest
{
    __m256d PM = _mm256_setr_pd( 1, -1, 1, 1);
    __m256d MP = _mm256_setr_pd(-1,  1, 1, 1);
    __m256d MM = _mm256_setr_pd(-1, -1, 1, 1);
    assert(_mm256_testnzc_pd(PM, MP) == 0);
    assert(_mm256_testnzc_pd(PM, MM) == 1);
    assert(_mm256_testnzc_pd(MP, MP) == 0);
    assert(_mm256_testnzc_pd(MP, MM) == 1);
    assert(_mm256_testnzc_pd(MM, MM) == 0);
}

/// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 128-bit value, and set ZF to 1 if the
/// sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that corresponds to a positive
/// number in `a`, AND there is at least one negative number in `b` that corresponds to a negative
/// number in `a`.
int _mm_testnzc_ps (__m128 a, __m128 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcps(a, b);
    }
    else
    {
        int4 la = cast(int4)a;
        int4 lb = cast(int4)b;
        int4 r = la & lb;
        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
        int ZF = (~m >> 31) & 1;
        int4 r2 = ~la & lb;
        int m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3];
        int CF = (~m2 >> 31) & 1;
        return (CF | ZF) == 0;
    }
}
unittest
{
    __m128 PM = _mm_setr_ps( 1, -1, 1, 1);
    __m128 MP = _mm_setr_ps(-1,  1, 1, 1);
    __m128 MM = _mm_setr_ps(-1, -1, 1, 1);
    assert(_mm_testnzc_ps(PM, MP) == 0);
    assert(_mm_testnzc_ps(PM, MM) == 1);
    assert(_mm_testnzc_ps(MP, MP) == 0);
    assert(_mm_testnzc_ps(MP, MM) == 1);
    assert(_mm_testnzc_ps(MM, MM) == 0);
}

/// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 256-bit value, and set ZF to 1 if the
/// sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise
/// set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
///
/// In other words: there is at least one negative number in `b` that corresponds to a positive
/// number in `a`, AND there is at least one negative number in `b` that corresponds to a negative
/// number in `a`.
int _mm256_testnzc_ps (__m256 a, __m256 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestnzcps256(a, b);
    }
    else
    {
        int8 la = cast(int8)a;
        int8 lb = cast(int8)b;
        int8 r = la & lb;
        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3]
              | r.array[4] | r.array[5] | r.array[6] | r.array[7];
        int ZF = (~m >> 31) & 1;
        int8 r2 = ~la & lb;
        int m2 = r2.array[0] | r2.array[1] | r2.array[2] | r2.array[3]
               | r2.array[4] | r2.array[5] | r2.array[6] | r2.array[7];
        int CF = (~m2 >> 31) & 1;
        return (CF | ZF) == 0;
    }
}
unittest
{
    __m256 PM = _mm256_setr_ps(1, 1, 1, 1,  1, -1, 1, 1);
    __m256 MP = _mm256_setr_ps(1, 1, 1, 1, -1,  1, 1, 1);
    __m256 MM = _mm256_setr_ps(1, 1, 1, 1, -1, -1, 1, 1);
    assert(_mm256_testnzc_ps(PM, MP) == 0);
    assert(_mm256_testnzc_ps(PM, MM) == 1);
    assert(_mm256_testnzc_ps(MP, MP) == 0);
    assert(_mm256_testnzc_ps(MP, MM) == 1);
    assert(_mm256_testnzc_ps(MM, MM) == 0);
}

/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm256_testnzc_si256 (__m256i a, __m256i b) pure @trusted
{
    // PERF ARM64
    // PERF DMD
    // PERF LDC without AVX
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_ptestnzc256(cast(long4) a, cast(long4) b);
    }
    else
    {
        // Need to defer to _mm_testnzc_si128 if possible, for more speed
        __m256i c = a & b;
        __m256i d = ~a & b;
        long m = c.array[0] | c.array[1] | c.array[2] | c.array[3];
        long n = d.array[0] | d.array[1] | d.array[2] | d.array[3];
        return (m != 0) & (n != 0);
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0, 0, 0, 0);
    __m256i M = _mm256_setr_epi32(0x01, 0x40, 0x00, 0x00, 0, 0, 0, 0);
    __m256i Z = _mm256_setzero_si256();
    assert(_mm256_testnzc_si256(A, Z) == 0);
    assert(_mm256_testnzc_si256(A, M) == 1);
    assert(_mm256_testnzc_si256(A, A) == 0);
}

/// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 128-bit value, and return 1 if the sign
/// bit of each 64-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number in the same lane.
int _mm_testz_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzpd(a, b);
    }
    else
    {
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 r = la & lb;
        long m = r.array[0] | r.array[1];
        return (~m >> 63) & 1;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(-1,  1);
    __m128d B = _mm_setr_pd(-1, -1);
    __m128d C = _mm_setr_pd( 1, -1);
    assert(_mm_testz_pd(A, A) == 0);
    assert(_mm_testz_pd(A, B) == 0);
    assert(_mm_testz_pd(C, A) == 1);
}

/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 256-bit value, and return 1 if the sign
/// bit of each 64-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number in the same lane.
int _mm256_testz_pd (__m256d a, __m256d b) pure @trusted
{
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzpd256(a, b);
    }
    else
    {
        long4 la = cast(long4)a;
        long4 lb = cast(long4)b;
        long4 r = la & lb;
        long r2 = r.array[0] | r.array[1] | r.array[2] | r.array[3];
        return (~r2 >> 63) & 1;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(-1,  1, -1,  1);
    __m256d B = _mm256_setr_pd( 1,  1, -1,  1);
    __m256d C = _mm256_setr_pd( 1, -1,  1, -1);
    assert(_mm256_testz_pd(A, A) == 0);
    assert(_mm256_testz_pd(A, B) == 0);
    assert(_mm256_testz_pd(C, A) == 1);
}

/// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 128-bit value, and return 1 if the sign
/// bit of each 32-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number in the same lane.
int _mm_testz_ps (__m128 a, __m128 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzps(a, b);
    }
    else
    {
        int4 la = cast(int4)a;
        int4 lb = cast(int4)b;
        int4 r = la & lb;
        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3];
        return (~m >> 31) & 1;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(-1,  1, -1,  1);
    __m128 B = _mm_setr_ps( 1,  1, -1,  1);
    __m128 C = _mm_setr_ps( 1, -1,  1, -1);
    assert(_mm_testz_ps(A, A) == 0);
    assert(_mm_testz_ps(A, B) == 0);
    assert(_mm_testz_ps(C, A) == 1);
    assert(_mm_testz_ps(C, B) == 1);
}

/// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point
/// elements) in `a` and `b`, producing an intermediate 256-bit value, and return 1 if the sign
/// bit of each 32-bit element in the intermediate value is zero, otherwise return 0.
/// In other words, return 1 if `a` and `b` don't both have a negative number in the same lane.
int _mm256_testz_ps (__m256 a, __m256 b) pure @safe
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_vtestzps256(a, b);
    }
    else
    {
        int8 la = cast(int8)a;
        int8 lb = cast(int8)b;
        int8 r = la & lb;
        int m = r.array[0] | r.array[1] | r.array[2] | r.array[3]
              | r.array[4] | r.array[5] | r.array[6] | r.array[7];
        return (~m >> 31) & 1;
    }
}
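// A minimal check for `_mm256_testz_ps`, in the style of its 128-bit sibling above
// (sign bits either overlap in some lane, or don't):
unittest
{
    __m256 A = _mm256_setr_ps(-1,  1, -1,  1, -1,  1, -1,  1);
    __m256 B = _mm256_setr_ps( 1,  1, -1,  1,  1,  1, -1,  1);
    __m256 C = _mm256_setr_ps( 1, -1,  1, -1,  1, -1,  1, -1);
    assert(_mm256_testz_ps(A, A) == 0);
    assert(_mm256_testz_ps(A, B) == 0);
    assert(_mm256_testz_ps(C, A) == 1);
    assert(_mm256_testz_ps(C, B) == 1);
}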
/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and `b`,
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm256_testz_si256 (__m256i a, __m256i b) @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_AVX)
    {
        return __builtin_ia32_ptestz256(cast(long4)a, cast(long4)b);
    }
    else version(LDC)
    {
        // better to split than do vanilla (down to 8 inst in arm64)
        __m128i lo_a = _mm256_extractf128_si256!0(a);
        __m128i lo_b = _mm256_extractf128_si256!0(b);
        __m128i hi_a = _mm256_extractf128_si256!1(a);
        __m128i hi_b = _mm256_extractf128_si256!1(b);
        return _mm_testz_si128(lo_a, lo_b) & _mm_testz_si128(hi_a, hi_b);
    }
    else
    {
        __m256i c = a & b;
        long[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m256i A  = _mm256_setr_epi32(0x01, 0x02, 0x04, 0xf8, 0x01, 0x02, 0x04, 0xf8);
    __m256i M1 = _mm256_setr_epi32(0xfe, 0xfd, 0x00, 0x07, 0xfe, 0xfd, 0x00, 0x07);
    __m256i M2 = _mm256_setr_epi32(0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00);
    assert(_mm256_testz_si256(A, A) == 0);
    assert(_mm256_testz_si256(A, M1) == 1);
    assert(_mm256_testz_si256(A, M2) == 0);
}

/// Return vector of type `__m256d` with undefined elements.
__m256d _mm256_undefined_pd () pure @safe
{
    __m256d r = void;
    return r;
}

/// Return vector of type `__m256` with undefined elements.
__m256 _mm256_undefined_ps () pure @safe
{
    __m256 r = void;
    return r;
}

/// Return vector of type `__m256i` with undefined elements.
__m256i _mm256_undefined_si256 () pure @safe
{
    __m256i r = void;
    return r;
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of
/// each 128-bit lane in `a` and `b`.
__m256d _mm256_unpackhi_pd (__m256d a, __m256d b) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
                   ret <4 x double> %r`;
        return LDCInlineIR!(ir, double4, double4, double4)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpckhpd256 (a, b);
    }
    else
    {
        __m256d r;
        r.ptr[0] = a.array[1];
        r.ptr[1] = b.array[1];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    __m256d B = _mm256_setr_pd(5.0, 6, 7, 8);
    __m256d C = _mm256_unpackhi_pd(A, B);
    double[4] correct = [2.0, 6, 4, 8];
    assert(C.array == correct);
}

/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of
/// each 128-bit lane in `a` and `b`.
__m256 _mm256_unpackhi_ps (__m256 a, __m256 b) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <8 x float> %0, <8 x float> %1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
                   ret <8 x float> %r`;
        return LDCInlineIR!(ir, float8, float8, float8)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpckhps256 (a, b);
    }
    else
    {
        __m256 r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = b.array[2];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        r.ptr[4] = a.array[6];
        r.ptr[5] = b.array[6];
        r.ptr[6] = a.array[7];
        r.ptr[7] = b.array[7];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    __m256 B = _mm256_setr_ps(8.0f, 9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_unpackhi_ps(A, B);
    float[8] correct = [2.0f, 10, 3, 11, 6, 14, 7, 15];
    assert(C.array == correct);
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of
/// each 128-bit lane in `a` and `b`.
__m256d _mm256_unpacklo_pd (__m256d a, __m256d b)
{
    static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
                   ret <4 x double> %r`;
        return LDCInlineIR!(ir, double4, double4, double4)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpcklpd256 (a, b);
    }
    else
    {
        __m256d r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[2];
        r.ptr[3] = b.array[2];
        return r;
    }
}
unittest
{
    __m256d A = _mm256_setr_pd(1.0, 2, 3, 4);
    __m256d B = _mm256_setr_pd(5.0, 6, 7, 8);
    __m256d C = _mm256_unpacklo_pd(A, B);
    double[4] correct = [1.0, 5, 3, 7];
    assert(C.array == correct);
}

/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of
/// each 128-bit lane in `a` and `b`.
__m256 _mm256_unpacklo_ps (__m256 a, __m256 b)
{
    static if (LDC_with_optimizations)
    {
        enum ir = `%r = shufflevector <8 x float> %0, <8 x float> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
                   ret <8 x float> %r`;
        return LDCInlineIR!(ir, float8, float8, float8)(a, b);
    }
    else static if (GDC_with_AVX)
    {
        return __builtin_ia32_unpcklps256 (a, b);
    }
    else
    {
        __m256 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        r.ptr[4] = a.array[4];
        r.ptr[5] = b.array[4];
        r.ptr[6] = a.array[5];
        r.ptr[7] = b.array[5];
        return r;
    }
}
unittest
{
    __m256 A = _mm256_setr_ps(0.0f, 1, 2, 3, 4, 5, 6, 7);
    __m256 B = _mm256_setr_ps(8.0f, 9, 10, 11, 12, 13, 14, 15);
    __m256 C = _mm256_unpacklo_ps(A, B);
    float[8] correct = [0.0f, 8, 1, 9, 4, 12, 5, 13];
    assert(C.array == correct);
}

/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m256d _mm256_xor_pd (__m256d a, __m256d b) pure @safe
{
    return cast(__m256d)( cast(__m256i)a ^ cast(__m256i)b );
}

/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m256 _mm256_xor_ps (__m256 a, __m256 b) pure @safe
{
    return cast(__m256)( cast(__m256i)a ^ cast(__m256i)b );
}

/// Zero the contents of all XMM or YMM registers.
void _mm256_zeroall () pure @safe
{
    // PERF DMD needs to do it explicitly if AVX is ever used one day.

    static if (GDC_with_AVX)
    {
        __builtin_ia32_vzeroall();
    }
    else
    {
        // Do nothing. The transition penalties are supposed to be handled by the backend (e.g. LLVM).
    }
}

/// Zero the upper 128 bits of all YMM registers; the lower 128 bits are left unmodified.
void _mm256_zeroupper () pure @safe
{
    // PERF DMD needs to do it explicitly if AVX is ever used.

    static if (GDC_with_AVX)
    {
        __builtin_ia32_vzeroupper();
    }
    else
    {
        // Do nothing. The transition penalties are supposed to be handled by the backend (e.g. LLVM).
    }
}
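// A usage sketch for the above (illustrative): after a burst of 256-bit work,
// `_mm256_zeroupper()` avoids AVX-to-SSE transition penalties when older,
// separately-compiled SSE code runs next. On compilers that manage transitions
// themselves, this is a harmless no-op, as the fallback above shows.
unittest
{
    align(32) float[8] data = [1.0f, 2, 3, 4, 5, 6, 7, 8];
    __m256 acc = _mm256_load_ps(data.ptr);
    acc = _mm256_add_ps(acc, acc);
    _mm256_store_ps(data.ptr, acc);
    _mm256_zeroupper(); // leave the "clean upper" state before any SSE-only code
    assert(data[0] == 2.0f);
}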
/// Cast vector of type `__m128d` to type `__m256d`; the upper 128 bits of the result are zeroed.
__m256d _mm256_zextpd128_pd256 (__m128d a) pure @trusted
{
    __m256d r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[1];
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return r;
}
unittest
{
    __m256d R = _mm256_zextpd128_pd256(_mm_setr_pd(2.0, -3.0));
    double[4] correct = [2.0, -3, 0, 0];
    assert(R.array == correct);
}

/// Cast vector of type `__m128` to type `__m256`; the upper 128 bits of the result are zeroed.
__m256 _mm256_zextps128_ps256 (__m128 a) pure @trusted
{
    double2 la = cast(double2)a;
    double4 r;
    r.ptr[0] = la.array[0];
    r.ptr[1] = la.array[1];
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return cast(__m256)r;
}
unittest
{
    __m256 R = _mm256_zextps128_ps256(_mm_setr_ps(2.0, -3.0, 4, -5));
    float[8] correct = [2.0, -3, 4, -5, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Cast vector of type `__m128i` to type `__m256i`; the upper 128 bits of the result are zeroed.
__m256i _mm256_zextsi128_si256 (__m128i a) pure @trusted
{
    long2 la = cast(long2)a;
    __m256i r;
    r.ptr[0] = la.array[0];
    r.ptr[1] = la.array[1];
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return r;
}
unittest
{
    __m256i R = _mm256_zextsi128_si256(_mm_setr_epi64(-1, 99));
    long[4] correct = [-1, 99, 0, 0];
    assert(R.array == correct);
}


// F16C starts here

enum _MM_FROUND_TO_NEAREST_INT = 0,
     _MM_FROUND_TO_NEG_INF     = 1,
     _MM_FROUND_TO_POS_INF     = 2,
     _MM_FROUND_TO_ZERO        = 3,
     _MM_FROUND_CUR_DIRECTION  = 4,
     _MM_FROUND_RAISE_EXC      = 0,
     _MM_FROUND_NO_EXC         = 8,
     _MM_FROUND_NINT           = 0,
     _MM_FROUND_FLOOR     = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF,
     _MM_FROUND_CEIL      = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF,
     _MM_FROUND_TRUNC     = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO,
     _MM_FROUND_RINT      = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION,
     _MM_FROUND_NEARBYINT = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;

/// Convert 4 packed half-precision (16-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
/// Note: Only the lowest 64 bits of the input are considered.
/// Preserves infinities, signs of zeroes, and NaN-ness.
__m128 _mm_cvtph_ps(__m128i a) pure @trusted
{
    static if (LDC_with_ARM64)
    {
        short4 h;
        short8 as = cast(short8) a;
        h.ptr[0] = as.array[0];
        h.ptr[1] = as.array[1];
        h.ptr[2] = as.array[2];
        h.ptr[3] = as.array[3];
        return vcvt_f32_f16(h);
    }
    else static if (LDC_with_F16C)
    {
        // Note: clang has a __builtin_ia32_vcvtph2ps256 but we don't
        // Note: LLVM IR fpext leads to call __extendhfsf2@PLT
        // Same with the pragma llvm.convert.from.fp16, so not sure
        // what to do
        return cast(__m128)__asm!(float4)("vcvtph2ps $1, $0", "=v,v", a);
    }
    else
    {
        // Emulation adapted from the F16C fallback in stb_image_resize2.h:
        // half-to-float via integer manipulation of exponent and mantissa.
        __m128i mask_nosign      = _mm_set1_epi32(0x7fff);
        __m128i smallest_normal  = _mm_set1_epi32(0x0400);
        __m128i infinity         = _mm_set1_epi32(0x7c00);
        __m128i expadjust_normal = _mm_set1_epi32((127 - 15) << 23);
        __m128i magic_denorm     = _mm_set1_epi32(113 << 23);

        __m128i i = a;
        __m128i h = _mm_unpacklo_epi16 ( i, _mm_setzero_si128() );
        __m128i mnosign     = mask_nosign;
        __m128i eadjust     = expadjust_normal;
        __m128i smallest    = smallest_normal;
        __m128i infty       = infinity;
        __m128i expmant     = _mm_and_si128(mnosign, h);
        __m128i justsign    = _mm_xor_si128(h, expmant);
        __m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
        __m128i b_isdenorm  = _mm_cmpgt_epi32(smallest, expmant);
        __m128i shifted     = _mm_slli_epi32(expmant, 13);
        __m128i adj_infnan  = _mm_andnot_si128(b_notinfnan, eadjust);
        __m128i adjusted    = _mm_add_epi32(eadjust, shifted);
        __m128i den1        = _mm_add_epi32(shifted, magic_denorm);
        __m128i adjusted2   = _mm_add_epi32(adjusted, adj_infnan);
        __m128  den2        = _mm_sub_ps(cast(__m128)den1, *cast(const(__m128)*)&magic_denorm);
        __m128  adjusted3   = _mm_and_ps(den2, cast(__m128)b_isdenorm);
        __m128  adjusted4   = _mm_andnot_ps(cast(__m128)b_isdenorm, cast(__m128)adjusted2);
        __m128  adjusted5   = _mm_or_ps(adjusted3, adjusted4);
        __m128i sign        = _mm_slli_epi32(justsign, 16);
        __m128  final_      = _mm_or_ps(adjusted5, cast(__m128)sign);
        return final_;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(cast(short)0x8000, 0x7C00, cast(short)0xDA90, 0x5000, 0, 0, 0, 0);
    float[4] correct = [-0.0f, float.infinity, -210.0f, 32.0f];
    __m128 R = _mm_cvtph_ps(A);
    assert(R.array == correct);
}

/// Convert 8 packed half-precision (16-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
/// Note: Preserves infinities, signs of zeroes, and NaN-ness.
__m256 _mm256_cvtph_ps(__m128i a) pure @trusted
{
    static if (LDC_with_F16C)
    {
        return __asm!(float8)("vcvtph2ps $1, $0", "=v,v", a);
    }
    else
    {
        // stb_image_resize2.h simply hand-inlines _mm_cvtph_ps twice;
        // here we call it on each 64-bit half instead.
        int4 ihi;
        ihi.ptr[0] = a.array[2];
        ihi.ptr[1] = a.array[3];
        __m128 lo = _mm_cvtph_ps(a);
        __m128 hi = _mm_cvtph_ps(ihi);
        return _mm256_set_m128(hi, lo);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, cast(short)-32768, 0, cast(short)0xFC00, 0x7C00, 0x5A90, cast(short)0xDA90, 0x5000);
    float[8] correct = [0.0f, -0.0f, 0.0f, -float.infinity, float.infinity, 210.0f, -210.0f, 32.0f];
    __m256 R = _mm256_cvtph_ps(A);
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to
/// packed half-precision (16-bit) floating-point elements USING NEAREST ROUNDING.
///
/// Normally rounding is done according to the imm8[2:0] parameter, which can be one of:
/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
/// - _MM_FROUND_TO_NEG_INF     // round down
/// - _MM_FROUND_TO_POS_INF     // round up
/// - _MM_FROUND_TO_ZERO        // truncate
/// - _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// However, only nearest rounding is supported here; this is enforced statically.
/// Note: Preserves infinities, signs of zeroes, and NaN-ness.
__m128i _mm_cvtps_ph(int imm8)(__m128 a)
{
    // Only nearest rounding is supported.
    // If you REALLY need another rounding mode, contact intel-intrinsics authors.
    static assert((imm8 & 3) == _MM_FROUND_TO_NEAREST_INT);

    static if (LDC_with_ARM64)
    {
        // BUG: this has semantics different from the x86
        // version. vcvt_f16_f32 uses FPCR, while the x86
        // path uses nearest-rounding.

        short8 r;
        short4 h = vcvt_f16_f32(a);
        r.ptr[0] = h.array[0];
        r.ptr[1] = h.array[1];
        r.ptr[2] = h.array[2];
        r.ptr[3] = h.array[3];
        r.ptr[4] = 0;
        r.ptr[5] = 0;
        r.ptr[6] = 0;
        r.ptr[7] = 0;
        return cast(__m128i) r;
    }
    else static if (GDC_or_LDC_with_F16C)
    {
        return cast(__m128i) __builtin_ia32_vcvtps2ph(a, imm8);
    }
    else
    {
        // Source: stb_image_resize2.h
        // Fabian Giesen's round-to-nearest-even float to half routine.
        __m128  msign       = cast(__m128) _mm_set1_epi32(0x80000000);
        __m128  justsign    = _mm_and_ps(msign, a);
        __m128  absf        = _mm_xor_ps(a, justsign);
        __m128i absf_int    = cast(__m128i) absf;
        __m128i f16max      = _mm_set1_epi32((127 + 16) << 23);
        __m128  b_isnan     = _mm_cmpunord_ps(absf, absf);        // is this a NaN?
        __m128i b_isregular = _mm_cmpgt_epi32(f16max, absf_int);  // (sub)normalized or special?
        __m128i c_nanbit        = _mm_set1_epi32(0x200);
        __m128i c_infty_as_fp16 = _mm_set1_epi32(0x7c00);
        __m128i nanbit          = _mm_and_si128(_mm_castps_si128(b_isnan), c_nanbit);
        __m128i inf_or_nan      = _mm_or_si128(nanbit, c_infty_as_fp16); // output for specials

        __m128i min_normal = _mm_set1_epi32( (127 - 14) << 23 ); // smallest FP32 that yields a normalized FP16
        __m128i b_issub    = _mm_cmpgt_epi32(min_normal, absf_int);
        __m128i c_subnorm_magic = _mm_set1_epi32( ((127 - 15) + (23 - 10) + 1) << 23 );

        // "result is subnormal" path
        __m128  subnorm1 = _mm_add_ps(absf, cast(__m128) c_subnorm_magic);          // magic value to round output mantissa
        __m128i subnorm2 = _mm_sub_epi32(cast(__m128i)(subnorm1), c_subnorm_magic); // subtract out bias

        // "result is normal" path
        __m128i mantoddbit = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
        __m128i mantodd    = _mm_srai_epi32(mantoddbit, 31);    // -1 if FP16 mantissa odd, else 0

        __m128i c_normal_bias = _mm_set1_epi32(0xfff - ((127 - 15) << 23) );
        __m128i round1 = _mm_add_epi32(absf_int, c_normal_bias);
        __m128i round2 = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
        __m128i normal = _mm_srli_epi32(round2, 13);     // rounded result

        // combine the two non-specials
        __m128i nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));

        // merge in specials as well
        __m128i joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));

        __m128i sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
        __m128i final_ = _mm_or_si128(joined, sign_shift);
        return _mm_packs_epi32(final_, _mm_setzero_si128());
    }
}
unittest
{
    float[4] A = [0.0f, -0.0f, 0.0f, -float.infinity];
    short8 RA = cast(short8) _mm_cvtps_ph!_MM_FROUND_TO_NEAREST_INT(_mm_loadu_ps(A.ptr));
    short[8] correctA = [0, cast(short)-32768, 0, cast(short)0xFC00, 0, 0, 0, 0];
    assert(RA.array == correctA);

    float[4] B = [float.infinity, 210.0f, -210.0f, 32.0000001f];
    short8 RB = cast(short8) _mm_cvtps_ph!_MM_FROUND_TO_NEAREST_INT(_mm_loadu_ps(B.ptr));
    short[8] correctB = [0x7C00, 0x5A90, cast(short)0xDA90, 0x5000, 0, 0, 0, 0];
    assert(RB.array == correctB);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to
/// packed half-precision (16-bit) floating-point elements USING NEAREST ROUNDING.
///
/// Normally rounding is done according to the imm8[2:0] parameter, which can be one of:
/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
/// - _MM_FROUND_TO_NEG_INF     // round down
/// - _MM_FROUND_TO_POS_INF     // round up
/// - _MM_FROUND_TO_ZERO        // truncate
/// - _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// However, only nearest rounding is supported here; this is enforced statically.
/// Note: Preserves infinities, signs of zeroes, and NaN-ness.
__m128i _mm256_cvtps_ph(int imm8)(__m256 a)
{
    // Only nearest rounding is supported.
    // If you REALLY need another rounding mode, contact intel-intrinsics authors.
    static assert((imm8 & 3) == _MM_FROUND_TO_NEAREST_INT);

    static if (LDC_with_ARM64)
    {
        // BUG: this has semantics different from the x86
        // version. vcvt_f16_f32 uses FPCR, while the x86
        // path uses nearest-rounding.
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        short8 r;
        short4 h1 = vcvt_f16_f32(a_lo);
        short4 h2 = vcvt_f16_f32(a_hi);
        r.ptr[0] = h1.array[0];
        r.ptr[1] = h1.array[1];
        r.ptr[2] = h1.array[2];
        r.ptr[3] = h1.array[3];
        r.ptr[4] = h2.array[0];
        r.ptr[5] = h2.array[1];
        r.ptr[6] = h2.array[2];
        r.ptr[7] = h2.array[3];
        return cast(__m128i) r;
    }
    else static if (GDC_or_LDC_with_F16C)
    {
        return cast(__m128i) __builtin_ia32_vcvtps2ph256(a, imm8);
    }
    else
    {
        __m128 a_lo = _mm256_extractf128_ps!0(a);
        __m128 a_hi = _mm256_extractf128_ps!1(a);
        long2 r_lo = cast(long2) _mm_cvtps_ph!imm8(a_lo);
        long2 r_hi = cast(long2) _mm_cvtps_ph!imm8(a_hi);
        long2 r;
        r.ptr[0] = r_lo.array[0];
        r.ptr[1] = r_hi.array[0];
        return cast(__m128i) r;
    }
}
unittest
{
    float[8] A = [0.0f, -0.0f, 0.0f, -float.infinity, float.infinity, 210.0f, -210.0f, 32.0000001f];
    short8 R = cast(short8) _mm256_cvtps_ph!_MM_FROUND_TO_NEAREST_INT(_mm256_loadu_ps(A.ptr));
    short[8] correct = [0, -32768, 0, cast(short)0xFC00, 0x7C00, 0x5A90, cast(short)0xDA90, 0x5000];
    assert(R.array == correct);
}
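// A round-trip sketch (illustrative): values exactly representable in FP16,
// including signed zero and the FP16 maximum 65504, survive
// _mm256_cvtps_ph -> _mm256_cvtph_ps unchanged.
unittest
{
    __m256 A = _mm256_setr_ps(0.5f, -2.0f, 65504.0f, -0.0f, 1.0f, 0.25f, -1024.0f, 8.0f);
    __m128i H = _mm256_cvtps_ph!_MM_FROUND_TO_NEAREST_INT(A);
    __m256 B = _mm256_cvtph_ps(H);
    float[8] correct = [0.5f, -2.0f, 65504.0f, -0.0f, 1.0f, 0.25f, -1024.0f, 8.0f];
    assert(B.array == correct);
}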