/**
* SSE4.1 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
*            cet 2024.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.smmintrin;

// SSE4.1 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
// Note: this header will work whether you have SSE4.1 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
// generate SSE4.1 instructions.
// With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions.

public import inteli.types;
import inteli.internals;

// smmintrin pulls in all previous instruction set intrinsics.
public import inteli.tmmintrin;

nothrow @nogc:

// Rounding-control constants for the _mm_round_* intrinsics.
// The low 2 bits select the rounding direction; bit 2 means "use MXCSR's
// current direction"; bit 3 suppresses precision exceptions.
enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
enum int _MM_FROUND_TO_NEG_INF = 0x01; /// ditto
enum int _MM_FROUND_TO_POS_INF = 0x02; /// ditto
enum int _MM_FROUND_TO_ZERO = 0x03; /// ditto
enum int _MM_FROUND_CUR_DIRECTION = 0x04; /// ditto
enum int _MM_FROUND_RAISE_EXC = 0x00; /// ditto
enum int _MM_FROUND_NO_EXC = 0x08; /// ditto

// Convenience combinations matching the classic rint/floor/ceil/trunc behaviors.
enum int _MM_FROUND_NINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT);
enum int _MM_FROUND_FLOOR = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
enum int _MM_FROUND_CEIL = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
enum int _MM_FROUND_TRUNC = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
enum int _MM_FROUND_RINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION);

/// Add packed signed 32-bit integers in `a` and `b` using saturation.
/// #BONUS
__m128i _mm_adds_epi32(__m128i a, __m128i b) pure
{
    // PERF: ARM64 should use 2x vqadd_s32
    static if (LDC_with_saturated_intrinsics)
        return cast(__m128i)inteli_llvm_adds!int4(cast(int4)a, cast(int4)b);
    else
    {
        // Fallback: detect signed overflow lane-wise and substitute the
        // saturated value (INT_MAX for positive overflow, INT_MIN for negative).
        __m128i int_max = _mm_set1_epi32(0x7FFFFFFF);
        __m128i res = _mm_add_epi32(a, b);
        __m128i sign_bit = _mm_srli_epi32(a, 31);
        // Overflow happened in a lane iff a and b have the same sign,
        // and the sum's sign differs from a's.
        __m128i sign_xor = _mm_xor_si128(a, b);
        __m128i overflow = _mm_andnot_si128(sign_xor, _mm_xor_si128(a, res));
        // INT_MAX + (a < 0 ? 1 : 0): INT_MAX for positive a, INT_MIN for negative a.
        __m128i saturated = _mm_add_epi32(int_max, sign_bit);
        // Select per-lane by the sign bit of `overflow`.
        return cast(__m128i) _mm_blendv_ps(cast(__m128)res,
                                           cast(__m128)saturated,
                                           cast(__m128)overflow);
    }
}
unittest
{
    __m128i a = _mm_setr_epi32(int.max, 1, 2, int.min);
    __m128i b = _mm_setr_epi32(1, 2, 3, -4);
    assert(_mm_adds_epi32(a, b).array == [int.max, 3, 5, int.min]);
}

/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        pragma(inline, true); // else wouldn't inline in _mm256_blend_epi16
        return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
    }
    else
    {
        // LDC x86 This generates pblendw since LDC 1.1 and -O2
        short8 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        for (int n = 0; n < 8; ++n)
        {
            // Bit n of imm8 selects lane n from b; otherwise from a.
            r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
    short[8] correct = [8, 9, 2, 3, 12, 5, 6, 15];
    assert(C.array == correct);
}


/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
// Note: changed signature, GDC needs a compile-time value for `imm8`.
__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    static assert(imm8 >= 0 && imm8 < 4);
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
    }
    else
    {
        // LDC x86: blendpd since LDC 1.1 -02, uses blendps after LDC 1.12
        double2 r;
        for (int n = 0; n < 2; ++n)
        {
            // Bit n of imm8 selects lane n from b; otherwise from a.
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return cast(__m128d)r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0, 1);
    __m128d B = _mm_setr_pd(8, 9);
    double2 C = _mm_blend_pd!2(A, B);
    double[2] correct = [0, 9];
    assert(C.array == correct);
}


/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control
/// mask `imm8`.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) pure @trusted
{
    // PERF DMD
    static assert(imm8 >= 0 && imm8 < 16);
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendps(a, b, imm8);
    }
    else version(LDC)
    {
        // LDC x86: generates blendps since LDC 1.1 -O2
        // arm64: pretty good, two instructions worst case
        // Shuffle indices 0..3 pick lanes of a, 4..7 pick lanes of b.
        return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0,
                                         (imm8 & 2) ? 5 : 1,
                                         (imm8 & 4) ? 6 : 2,
                                         (imm8 & 8) ? 7 : 3)(a, b);
    }
    else
    {
        // PERF GDC without SSE4.1 is quite bad
        __m128 r;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(0, 1, 2, 3);
    __m128 B = _mm_setr_ps(8, 9, 10, 11);
    float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101
    float[4] correct = [8, 1, 10, 11];
    assert(C.array == correct);
}

/// Blend packed 8-bit integers from `a` and `b` using `mask`.
/// Select from `b` if the high-order bit of the corresponding 8-bit element in `mask` is set, else select from `a`.
__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) pure @trusted
{
    // PERF DMD
    /*static if (GDC_with_SSE41)
    {
        // This intrinsic do nothing in GDC 12.
        // TODO report to GDC. No problem in GCC.
        return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask);
    }
    else*/
    static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: two instructions since LDC 1.12 -O2
        // Arithmetic shift by 7 broadcasts the sign bit to the whole byte.
        byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7);
        return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a);
    }
    else
    {
        // m = 0xFF where mask byte is negative, else 0x00.
        __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
        // (a^b) saturating-minus 0xFF is 0, so the xor with b yields b where
        // selected, and (a^b)^b == a elsewhere.
        return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,
                               8,  9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23,
                              24, 25, 26, 27, 28, 29, 30, 31);
    __m128i M = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8, 127,
                               1,  1, -1, -1,  4,  1,  8, -128);
    byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M);
    byte[16] correct = [ 0, 17,  2,  3, 20,  5, 22,  7,
                         8,  9, 26, 27, 12, 13, 14, 31 ];
    assert(R.array == correct);
}


/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE42)
    {
        // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction
        // with -msse4.2 but not -msse4.1.
        // Not sure what is the reason, and there is a replacement sequence.
        // Sounds like a bug.
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        // Broadcast each lane's sign bit to all 64 bits, then bit-select.
        long2 shift;
        shift = 63;
        long2 lmask = cast(long2)mask >> shift;
        return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
    }
    else
    {
        __m128d r; // PERF =void;
        long2 lmask = cast(long2)mask;
        for (int n = 0; n < 2; ++n)
        {
            // Negative reinterpreted-integer means the mask sign bit is set.
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    __m128d M1 = _mm_setr_pd(-3.0, 2.0);
    __m128d R1 = _mm_blendv_pd(A, B, M1);
    double[2] correct1 = [3.0, 2.0];
    assert(R1.array == correct1);

    // Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost
    // See Issue #78
    __m128d M2 = _mm_setr_pd(double.nan, double.infinity);
    __m128d R2 = _mm_blendv_pd(A, B, M2);
    double[2] correct2 = [1.0, 2.0];
    assert(R2.array == correct2);
}


/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        // Broadcast each lane's sign bit to all 32 bits, then bit-select.
        int4 shift;
        shift = 31;
        int4 lmask = cast(int4)mask >> shift;
        return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
    }
    else
    {
        // LDC x86_64: Compiles to 5 instr since LDC 1.27 -O2
        // If lack of optimization, consider replacing by the classic
        // and/andnot/or select sequence on a sign-extended mask:
        //   __m128i m = _mm_srai_epi32(cast(__m128i)mask, 31);
        //   return cast(__m128) _mm_or_si128(
        //       _mm_and_si128(m, cast(__m128i)b),
        //       _mm_andnot_si128(m, cast(__m128i)a));
        // LLVM makes almost the same sequence when optimized.
        __m128 r;
        int4 lmask = cast(int4)mask;
        for (int n = 0; n < 4; ++n)
        {
            // Negative reinterpreted-integer means the mask sign bit is set.
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
    __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
    __m128 M2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f);
    __m128 R1 = _mm_blendv_ps(A, B, M1);
    __m128 R2 = _mm_blendv_ps(A, B, M2);
    float[4] correct1 = [ 4.0f, 1.0f, 2.0f, 7.0f];
    float[4] correct2 = [ 0.0f, 1.0f, 6.0f, 3.0f];
    assert(R1.array == correct1);

    // Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost
    // See Issue #78
    assert(R2.array == correct2);
}

/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value,
/// and store the results as packed double-precision floating-point elements.
__m128d _mm_ceil_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        // Unfortunately x86 intrinsics force a round-trip back to double2
        // ARM neon semantics wouldn't have that
        // vcvtpq rounds toward +infinity while converting to integers.
        long2 l = vcvtpq_s64_f64(a);
        double2 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        return r;
    }
    else
    {
        // 2 == _MM_FROUND_TO_POS_INF
        return _mm_round_pd!2(a);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3f, -2.12f);
    __m128d B = _mm_setr_pd(53.6f, -2.7f);
    A = _mm_ceil_pd(A);
    B = _mm_ceil_pd(B);
    double[2] correctA = [2.0, -2.0];
    double[2] correctB = [54.0, -2.0];
    assert(A.array == correctA);
    assert(B.array == correctB);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value,
/// and store the results as packed single-precision floating-point elements.
__m128 _mm_ceil_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        // vcvtpq rounds toward +infinity while converting to integers.
        int4 l = vcvtpq_s32_f32(a);
        float4 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        r.ptr[2] = l.array[2];
        r.ptr[3] = l.array[3];
        return r;
    }
    else
    {
        // 2 == _MM_FROUND_TO_POS_INF
        return _mm_round_ps!2(a);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 C = _mm_ceil_ps(A);
    float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f];
    assert(C.array == correct);
}

/// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value,
/// store the result as a double-precision floating-point element in the lower element of result,
/// and copy the upper element from `a` to the upper element of dst.
__m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Scalar convert-toward-+infinity of b's low lane; upper lane of a kept.
        a[0] = vcvtps_s64_f64(b[0]);
        return a;
    }
    else
    {
        // 2 == _MM_FROUND_TO_POS_INF
        return _mm_round_sd!2(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    __m128d B = _mm_setr_pd(53.6, -3.7);
    __m128d C = _mm_ceil_sd(A, B);
    double[2] correct = [54.0, -2.12];
    assert(C.array == correct);
}

/// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
/// store the result as a single-precision floating-point element in the lower element of result,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Scalar convert-toward-+infinity of b's low lane; upper lanes of a kept.
        a[0] = vcvtps_s32_f32(b[0]);
        return a;
    }
    else
    {
        // 2 == _MM_FROUND_TO_POS_INF
        return _mm_round_ss!2(a, b);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
    __m128 C = _mm_ceil_ss(A, B);
    float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f];
    assert(C.array == correct);
}

/// Compare packed 64-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        version(DigitalMars)
        {
            // DMD doesn't recognize long2 == long2
            long2 la = cast(long2)a;
            long2 lb = cast(long2)b;
            long2 res;
            res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
            res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
            return cast(__m128i)res;
        }
        else
        {
            return cast(__m128i)(cast(long2)a == cast(long2)b);
        }
    }
    else static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
    }
    else version(LDC)
    {
        // LDC x86: generates pcmpeqq since LDC 1.1 -O1
        // arm64: generates cmeq since LDC 1.8 -O1
        return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
    }
    else
    {
        // Clever pcmpeqd + pand use with LDC 1.24 -O2
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 res;
        res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
        res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
        return cast(__m128i)res;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, -2);
    __m128i B = _mm_setr_epi64(-3, -2);
    __m128i C = _mm_setr_epi64(-1, -4);
    long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
    long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
    long[2] correct1 = [0, -1];
    long[2] correct2 = [-1, 0];
    assert(AB.array == correct1);
    assert(AC.array == correct2);
}


/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
        // Take the low four 16-bit lanes and sign-extend each to 32 bits.
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = sext <4 x i16> %v to <4 x i32>
            ret <4 x i32> %r`;
        // BUG FIX: the IR yields an int4, which must be returned as __m128i;
        // it was previously cast to __m128d (double2), a wrong reinterpretation.
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
    }
    else
    {
        // Portable fallback: widen the low four lanes one by one.
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi16_epi32(A);
    int[4] correct = [-1, 0, -32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
        // Take the low two 16-bit lanes and sign-extend each to 64 bits.
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i16> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
    }
    else
    {
        // Portable fallback: widen the low two lanes one by one.
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi16_epi64(A);
    long[2] correct = [-32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
        // Take the low two 32-bit lanes and sign-extend each to 64 bits.
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i32> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
    }
    else
    {
        // Portable fallback: widen the low two lanes one by one.
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepi32_epi64(A);
    long[2] correct = [-4, 42];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepi8_epi16 (__m128i a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0
        // LDC ARM64: sshll generated since LDC 1.8.0 -O1
        // Take the low eight 8-bit lanes and sign-extend each to 16 bits.
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
            %r = sext <8 x i8> %v to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
    }
    else
    {
        // Portable fallback: widen the low eight lanes one by one.
        byte16 sa = cast(byte16)a;
        short8 r;
        foreach(n; 0..8)
            r.ptr[n] = sa.array[n];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepi8_epi16(A);
    short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE41 && LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
        // Take the low four 8-bit lanes and sign-extend each to 32 bits.
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            %r = sext <4 x i8> %v to <4 x i32>
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
    }
    else
    {
        // LDC ARM64: this gives the same codegen than a vmovl_s16/vmovl_s8 sequence would
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi8_epi32(A);
    int[4] correct = [127, -128, 1, -1];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxbq since LDC 1.1 -O0,
        // LDC arm64: it's ok since LDC 1.8 -O1
        // Take the low two 8-bit lanes and sign-extend each to 64 bits.
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i8> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
    }
    else
    {
        // Portable fallback: widen the low two lanes one by one.
        byte16 sa = cast(byte16)a;
        long2 r;
        foreach(n; 0..2)
            r.ptr[n] = sa.array[n];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi8_epi64(A);
    long[2] correct = [127, -128];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
        // arm64: ushll since LDC 1.12 -O1
        // The cast to ushort reinterprets each lane as unsigned before widening.
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu16_epi32(A);
    int[4] correct = [65535, 0, 32768, 32767];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit shorter than below, in -O2
        short8 sa = cast(short8)a;
        long2 r;
        for(int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ushort)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
        // The cast to ushort reinterprets each lane as unsigned before widening.
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu16_epi64(A);
    long[2] correct = [65535, 0];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // BUG FIX: __builtin_ia32_pmovzxdq128 takes a vector of four 32-bit
        // integers (v4si); the argument was wrongly passed as short8, which
        // does not match the builtin's signature.
        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
        // arm64: generates ushll since LDC 1.12 -O1
        // The cast to uint reinterprets each lane as unsigned before widening.
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = cast(uint)sa.array[0];
        r.ptr[1] = cast(uint)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepu32_epi64(A);
    long[2] correct = [4294967295, 42];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepu8_epi16 (__m128i a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // Take the low eight 8-bit lanes and zero-extend each to 16 bits.
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1,i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
            %r = zext <8 x i8> %v to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
    }
    else
    {
        // Interleaving with zero bytes is exactly a zero-extension of the low half.
        return _mm_unpacklo_epi8(a, _mm_setzero_si128());
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepu8_epi16(A);
    short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit better than below in -O2
        byte16 sa = cast(byte16)a;
        int4 r;
        for(int n = 0; n < 4; ++n)
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
        // PERF: catastrophic with GDC without SSE4.1
        // The cast to ubyte reinterprets each lane as unsigned before widening.
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        r.ptr[2] = cast(ubyte)sa.array[2];
        r.ptr[3] = cast(ubyte)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu8_epi32(A);
    int[4] correct = [127, 128, 1, 255];
    assert(C.array == correct);
}

/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: this optimizes better than the loop below
        byte16 sa = cast(byte16)a;
        long2 r;
        for (int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
        // The cast to ubyte reinterprets each lane as unsigned before widening.
        byte16 sa = cast(byte16)a;
        long2 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu8_epi64(A);
    long[2] correct = [127, 254];
    assert(C.array == correct);
}

/// Conditionally multiply the packed double-precision (64-bit) floating-point elements
/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, and conditionally
/// store the sum in dst using the low 4 bits of `imm8`.
__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // Only bits 0-1 (store mask) and 4-5 (multiply mask) are meaningful
        // for the 2-lane double version, hence the & 0x33.
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else
    {
        // Emulation: zero out non-selected products, sum, then broadcast the
        // sum only to the lanes selected by the low bits of imm8.
        __m128d zero = _mm_setzero_pd();
        __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b);
        double sum = temp.array[0] + temp.array[1];
        return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum));
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 8.0);
    double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B);
    double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B);
    double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B);
    double[2] correct1 = [ 4.0,  4.0];
    double[2] correct2 = [16.0,  0.0];
    double[2] correct3 = [ 0.0, 20.0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}

/// Conditionally multiply the packed single-precision (32-bit) floating-point elements
/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products,
/// and conditionally store the sum in result using the low 4 bits of `imm8`.
__m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(byte)imm8);
    }
    else
    {
        // Emulation: zero out non-selected products, sum all four, then
        // broadcast the sum only to the lanes selected by the low bits of imm8.
        __m128 zero = _mm_setzero_ps();
        __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b);
        float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
        return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum));
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
    __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
    float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B);
    float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B);
    float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B);
    float[4] correct1 = [67.0f, 67.0f, 67.0f, 67.0f];
    float[4] correct2 = [23.0f,  0.0f, 23.0f,  0.0f];
    float[4] correct3 = [ 0.0f, 29.0f,  0.0f, 29.0f];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}


/// Extract a 32-bit integer from `a`, selected with `imm8`.
int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
{
    // Index is taken modulo 4, matching hardware pextrd behavior.
    return (cast(int4)a).array[imm8 & 3];
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    assert(_mm_extract_epi32(A, 0) == 1);
    assert(_mm_extract_epi32(A, 1 + 8) == 2);
    assert(_mm_extract_epi32(A, 3 + 4) == 4);
}

/// Extract a 64-bit integer from `a`, selected with `imm8`.
long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
{
    // Index is taken modulo 2, matching hardware pextrq behavior.
    long2 la = cast(long2)a;
    return la.array[imm8 & 1];
}
unittest
{
    __m128i A = _mm_setr_epi64(45, -67);
    assert(_mm_extract_epi64(A, 0) == 45);
    assert(_mm_extract_epi64(A, 1) == -67);
    assert(_mm_extract_epi64(A, 2) == 45);
}

/// Extract an 8-bit integer from `a`, selected with `imm8`.
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
{
    // Pick the requested byte lane (index taken modulo 16, matching pextrb),
    // then zero-extend it to a 32-bit integer.
    byte16 lanes = cast(byte16)a;
    int lane = imm8 & 15;
    return lanes.array[lane] & 0xFF;
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
    assert(_mm_extract_epi8(A, 7) == 7);
    assert(_mm_extract_epi8(A, 13) == 255);   // -1 zero-extends to 0xFF
    assert(_mm_extract_epi8(A, 7 + 16) == 7); // index wraps modulo 16
}

/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
/// Note: returns a 32-bit $(I integer).
int _mm_extract_ps (__m128 a, const int imm8) @trusted
{
    // Reinterpret the vector as four 32-bit integers and return the raw bit
    // pattern of the selected lane (index taken modulo 4).
    int4 bits = cast(int4)a;
    return bits.array[imm8 & 3];
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
    assert(_mm_extract_ps(A, 0) == 0x3f800000);
    assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
    assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
}



/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an
/// integer value, and store the results as packed double-precision floating-point elements.
__m128d _mm_floor_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        // vcvtmq rounds toward minus infinity, then convert back to double.
        long2 l = vcvtmq_s64_f64(a);
        double2 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        return r;
    }
    else
    {
        return _mm_round_pd!1(a); // 1 == _MM_FROUND_TO_NEG_INF
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3f, -2.12f);
    __m128d B = _mm_setr_pd(53.6f, -2.7f);
    A = _mm_floor_pd(A);
    B = _mm_floor_pd(B);
    double[2] correctA = [1.0, -3.0];
    double[2] correctB = [53.0, -3.0];
    assert(A.array == correctA);
    assert(B.array == correctB);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an
/// integer value, and store the results as packed single-precision floating-point elements.
__m128 _mm_floor_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        // vcvtmq rounds toward minus infinity, then convert back to float.
        int4 l = vcvtmq_s32_f32(a);
        float4 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        r.ptr[2] = l.array[2];
        r.ptr[3] = l.array[3];
        return r;
    }
    else
    {
        return _mm_round_ps!1(a); // 1 == _MM_FROUND_TO_NEG_INF
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 C = _mm_floor_ps(A);
    float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
    assert(C.array == correct);
}

/// Round the lower double-precision (64-bit) floating-point element in `b` down to an
/// integer value, store the result as a double-precision floating-point element in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Scalar round-toward-minus-infinity conversion of b[0] only.
        a[0] = vcvtms_s64_f64(b[0]);
        return a;
    }
    else
    {
        return _mm_round_sd!1(a, b); // 1 == _MM_FROUND_TO_NEG_INF
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    __m128d B = _mm_setr_pd(-53.1, -3.7);
    __m128d C = _mm_floor_sd(A, B);
    double[2] correct = [-54.0, -2.12];
    assert(C.array == correct);
}

/// Round the lower single-precision (32-bit) floating-point element in `b` down to an
/// integer value, store the result as a single-precision floating-point element in the
/// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
__m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Scalar round-toward-minus-infinity conversion of b[0] only.
        a[0] = vcvtms_s32_f32(b[0]);
        return a;
    }
    else
    {
        return _mm_round_ss!1(a, b); // 1 == _MM_FROUND_TO_NEG_INF
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
    __m128 C = _mm_floor_ss(A, B);
    float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f];
    assert(C.array == correct);
}

/// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
    // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
    // LDC arm64: ins.s since LDC 1.8 -O2
    int4 ia = cast(int4)a;
    ia.ptr[imm8 & 3] = i;
    return cast(__m128i)ia;
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4);
    int[4] result = [1, 2, 5, 4];
    assert(C.array == result);
}

/// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
    // LDC x86: always do something sensible.
    long2 la = cast(long2)a;
    la.ptr[imm8 & 1] = i;
    return cast(__m128i)la;
}
unittest
{
    __m128i A = _mm_setr_epi64(1, 2);
    long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
    long[2] result = [1, 5];
    assert(C.array == result);
}

/// Insert the 8-bit integer `i` into `a` at the location specified by `imm8[2:0]`.
/// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8.
__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
    // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
    // Marked `pure` for consistency with _mm_insert_epi32/_mm_insert_epi64.
    byte16 ba = cast(byte16)a;
    ba.ptr[imm8 & 15] = cast(byte)i;
    return cast(__m128i)ba;
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
    byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    assert(C.array == result);
}


/// Warning: of course it does something totally different from `_mm_insert_epi32`!
/// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b`
/// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]`
/// (elements are zeroed out when the corresponding bit is set).
__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
    }
    else
    {
        // imm8[7:6] selects the source lane of b, imm8[5:4] the destination
        // lane of a, and imm8[3:0] zeroes lanes of the result.
        float4 tmp2 = a;
        float tmp1 = b.array[(imm8 >> 6) & 3];
        tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
        return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
    float[4] correct = [1.0f, 2.0f, 0.0f, 7.0f];
    assert(C.array == correct);
}


/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxsd since LDC 1.1 -O1
        // ARM: smax.4s since LDC 1.8 -01
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            int4 greater = sa > sb;
        else
            int4 greater = greaterMask!int4(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // max(a, b) = b ^ ((a ^ b) & (a > b)), avoids a blend.
        __m128i higher = _mm_cmpgt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
                                      _mm_setr_epi32(        -4,-8,  9, -8));
    int[4] correct = [0x7fffffff, 1, 9, 7];
    assert(R.array == correct);
}

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed maximum values.
__m128i _mm_max_epi8 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pmaxsb since LDC 1.1 -O1
        // ARM64: smax.16b since LDC 1.8.0 -O1
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            byte16 greater = sa > sb;
        else
            byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // max(a, b) = b ^ ((a ^ b) & (a > b))
        __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_max_epi8(A, B);
    byte[16] correct =       [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epu16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pmaxuw since LDC 1.1 -O1
        // ARM64: umax.8h since LDC 1.8.0 -O1
        // PERF: without sse4.1, LLVM 12 produces a very interesting
        // psubusw xmm0, xmm1
        // paddw xmm0, xmm1
        // sequence that maybe should go in other min/max intrinsics?
        ushort8 sa = cast(ushort8)a;
        ushort8 sb = cast(ushort8)b;
        static if (SIMD_COMPARISON_MASKS_16B)
        {
            // Note: doesn't work well with GDC, which prefers the builtin.
            ushort8 greater = sa > sb;
        }
        else
            ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // max(a, b) = a + saturating_sub(b, a), with wrap-around add.
        b = _mm_subs_epu16(b, a);
        b = _mm_add_epi16(b, a);
        return b;
    }
}
unittest
{
    short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9,     7, 0, 57),
                                          _mm_setr_epi16(   -4, -8, 9, -7, 0,-32768, 0,  0));
    short[8] correct =                                  [   -4, -8, -4, -7, 9,-32768, 0, 57];
    assert(R.array == correct);
}

/// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epu32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umax.4s since LDC 1.8.0 -O1
        uint4 sa = cast(uint4)a;
        uint4 sb = cast(uint4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            uint4 greater = sa > sb;
        else
            uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // Bias both operands by 0x80000000 so a signed compare orders them
        // as unsigned values.
        // PERF: LLVM suggests to replace the _mm_add_epi32 by _mm_xor_si128, and the last xor by an "_mm_or_si128"
        /+
            movdqa xmm2, xmmword ptr [-0x80000000, -0x80000000, -0x80000000, -0x80000000]
            movdqa xmm3, xmm1
            pxor xmm3, xmm2
            pxor xmm2, xmm0
            pcmpgtd xmm2, xmm3
            pand xmm0, xmm2
            pandn xmm2, xmm1
            por xmm0, xmm2
        +/
        __m128i valueShift = _mm_set1_epi32(-0x80000000);
        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7),
                                      _mm_setr_epi32(        -4,-8, 9, -8));
    int[4] correct =                                [        -4,-8, 9, -7];
    assert(R.array == correct);
}

/// Compare packed signed 32-bit integers in `a` and `b`, returns packed minimum values.
__m128i _mm_min_epi32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
        // ARM: smin.4s since LDC 1.8 -01
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            int4 greater = sa > sb;
        else
            int4 greater = greaterMask!int4(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        // min(a, b) = b ^ ((a ^ b) & (a < b))
        __m128i higher = _mm_cmplt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff,  1, -4,  7),
                                      _mm_setr_epi32(        -4, -8,  9, -8));
    int[4] correct =                                [        -4, -8, -4, -8];
    assert(R.array == correct);
}

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed minimum values.
__m128i _mm_min_epi8 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pminsb since LDC 1.1 -O1
        // ARM64: smin.16b since LDC 1.8.0 -O1
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            byte16 greater = sa > sb;
        else
            byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        // min(a, b) = b ^ ((a ^ b) & (a < b))
        __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_min_epi8(A, B);
    byte[16] correct =       [  4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
__m128i _mm_min_epu16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
        // ARM64: umin.8h since LDC 1.8.0 -O1
        ushort8 sa = cast(ushort8)a;
        ushort8 sb = cast(ushort8)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            ushort8 greater = (sb > sa);
        else
            ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // min(a, b) = b - saturating_sub(b, a)
        __m128i c = _mm_subs_epu16(b, a);
        b = _mm_sub_epi16(b, c);
        return b;
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9,     7, 0, 57),
                                          _mm_setr_epi16(   -4, -8, 9, -7, 0,-32768, 0,  0));
    short[8] correct =                                  [32767, 1,  9, -8, 0,     7, 0,  0];
    assert(R.array == correct);
}

/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
__m128i _mm_min_epu32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umin.4s since LDC 1.8.0 -O1
        uint4 sa = cast(uint4)a;
        uint4 sb = cast(uint4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            uint4 greater = sa > sb;
        else
            uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        // Bias by 0x80000000 so a signed compare orders unsigned values.
        // PERF: same remark as in _mm_max_epu32
        __m128i valueShift = _mm_set1_epi32(-0x80000000);
        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift));
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7),
                                      _mm_setr_epi32(        -4,-8, 9, -8));
    int[4] correct =                                [0x7fffffff, 1, 4, -8];
    assert(R.array == correct);
}

/// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`,
/// store the minimum and index in return value, and zero the remaining bits.
__m128i _mm_minpos_epu16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        // Pack each element as (value << 16) | index so one unsigned 32-bit
        // min reduction finds both the minimum and its position; on a value
        // tie the smaller index wins, matching phminposuw.
        __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
        __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
        __m128i best = _mm_min_epu32(combinedLo, combinedHi);
        best = _mm_min_epu32(best, _mm_srli_si128!8(best));
        best = _mm_min_epu32(best, _mm_srli_si128!4(best));
        short8 sbest = cast(short8)best;
        short8 r;
        r[0] = sbest[1];
        r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie
        r[2] = 0;
        r[3] = 0;
        r[4] = 0;
        r[5] = 0;
        r[6] = 0;
        r[7] = 0;
        return cast(__m128i)r;
    }
    else
    {
        // Scalar fallback: linear scan; strict `<` keeps the first (lowest
        // index) occurrence of the minimum, as the instruction requires.
        short8 sa = cast(short8)a;
        ushort min = 0xffff;
        int index = 0;
        for(int n = 0; n < 8; ++n)
        {
            ushort c = sa.array[n];
            if (c < min)
            {
                min = c;
                index = n;
            }
        }
        short8 r;
        r.ptr[0] = min;
        r.ptr[1] = cast(short)index;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
    __m128i B = _mm_setr_epi16(14,  4, 4, 2, -3, 2, 5, 6);
    short8 R1 = cast(short8) _mm_minpos_epu16(A);
    short8 R2 = cast(short8) _mm_minpos_epu16(B);
    short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
    short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
}

/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers
/// in `a` compared to those in `b`, and store the 16-bit results in dst.
/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`.
/// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`.
/// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting
/// at the offset specified in `imm8[2]`.
__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
    }
    else
    {
        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high order quadruplet are unaddressable...

        byte16 ba = cast(byte16)a;
        short8 r;

        // The 32-bit lane of `b` selected by imm8[1:0] is the reference
        // quadruplet; duplicate it in both 64-bit halves so one _mm_sad_epu8
        // call produces two SADs per iteration.
        // (Removed previously unused locals `b_offset` and `bb`.)
        __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);

        for (int j = 0; j < 8; j += 2)
        {
            int k = a_offset + j;
            __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
                                           0, 0, 0, 0,
                                           ba[k+1], ba[k+2], ba[k+3], ba[k+4],
                                           0, 0, 0, 0);
            short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64
            r.ptr[j]   = diffs.array[0];
            r.ptr[j+1] = diffs.array[4];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
    short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
    short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
    short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
    short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
    short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
    short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
    short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
    short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
    short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
    short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
    assert(r1.array == correct1);
    assert(r4.array == correct4);
    assert(r5.array == correct5);
    assert(r7.array == correct7);
    assert(r8.array == correct0);
}

/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.
__m128i _mm_mul_epi32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSE41 && LDC_with_optimizations)
    {
        // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
        // Use IR instead.
        // This generates pmuldq with since LDC 1.2.0 -O0
        enum ir = `
            %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
            %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
            %la = sext <2 x i32> %ia to <2 x i64>
            %lb = sext <2 x i32> %ib to <2 x i64>
            %r = mul <2 x i64> %la, %lb
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // 3 instructions since LDC 1.8 -O2
        // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull
        int2 a_lo = vmovn_s64(cast(long2)a);
        int2 b_lo = vmovn_s64(cast(long2)b);
        return cast(__m128i) vmull_s32(a_lo, b_lo);
    }
    else
    {
        // Scalar fallback: widen lanes 0 and 2 to 64-bit before multiplying.
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        long2 r;
        r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
        r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    long2 R = cast(long2) _mm_mul_epi32(A, B);
    long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
    assert(R.array == correct);
}

/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers,
/// return the low 32 bits of the intermediate integers.
__m128i _mm_mullo_epi32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    // PERF GDC without SSE4.1 could be better
    static if (GDC_with_SSE41)
    {
        // Multiply the explicitly-casted int4 vectors (the previous code
        // left ia/ib unused and multiplied `a * b` directly, relying on the
        // __m128i == int4 alias; semantics are identical, intent is clearer).
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        // Note: older GDC doesn't have that op, but older GDC
        // also has no support for -msse4.1 detection
        return cast(__m128i)(ia * ib);
    }
    else version(LDC)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        return cast(__m128i)(ia * ib);
    }
    else
    {
        // DMD doesn't take the above
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] * ib.array[0];
        r.ptr[1] = ia.array[1] * ib.array[1];
        r.ptr[2] = ia.array[2] * ib.array[2];
        r.ptr[3] = ia.array[3] * ib.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    int4 R = cast(int4) _mm_mullo_epi32(A, B);
    int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
    assert(R.array == correct);
}


/// Convert packed signed 32-bit integers from `a` and `b`
/// to packed 16-bit integers using unsigned saturation.
__m128i _mm_packus_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Clamp negatives to zero, then narrow with unsigned saturation.
        int4 z;
        z = 0;
        return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
                                          vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
    }
    else
    {
        // Bias trick: shift the unsigned range [0, 65535] onto the signed
        // range so the signed-saturating pack can be reused, then shift back.
        __m128i i32768 = _mm_set1_epi32(32768);
        __m128i s32768 = _mm_set1_epi16(-32768);
        a = _mm_sub_epi32(a, i32768);
        b = _mm_sub_epi32(b, i32768);
        __m128i clampedSigned = _mm_packs_epi32(a, b);
        return _mm_add_epi16(clampedSigned, s32768);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packus_epi32(A, A);
    short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
    assert(R.array == correct);
}


/// Round the packed double-precision (64-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed double-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_pd(int rounding)(__m128d a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integers (uses the current MXCSR rounding mode)
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);
            return _mm_setr_pd(lo, hi);
        }
        else
        {
            version(GNU) pragma(inline, false); // else fail unittest with optimizations

            // Temporarily force the requested rounding mode in MXCSR.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (MAYDO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return _mm_setr_pd(lo, hi);
        }
    }
}
unittest
{
    // tested in other intrinsics
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ps(int rounding)(__m128 a) @trusted
{
    // PERF ARM64: there is duplication because this isn't optimal for ARM64, so it is avoided externally
    static if (GDC_or_LDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            __m128i integers = _mm_cvtps_epi32(a);
            return _mm_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 gets shuffled
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m128i integers = _mm_cvtps_epi32(a);

            // Convert back to float to achieve the rounding
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (MAYDO: what range exactly?)
            __m128 result = _mm_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}


/// Round the lower double-precision (64-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a double-precision floating-point element
/// in the lower element of result, and copy the upper element from `a` to the upper element of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integer (uses the current MXCSR rounding mode)
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;
            return a;
        }
        else
        {
            version(GNU) pragma(inline, false); // else fail unittest with optimizations

            // Temporarily force the requested rounding mode in MXCSR.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (MAYDO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}


/// Round the lower single-precision (32-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a single-precision floating-point element
/// in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;
            return a;
        }
        else version(GNU)
        {
            // Nested non-inlined function: same GDC-optimization workaround
            // as the pragma(inline, false) in _mm_round_sd.
            pragma(inline, false)
            __m128 GDCworkaround() nothrow @nogc @trusted
            {
                uint old = _MM_GET_ROUNDING_MODE();
                _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

                // Convert to 32-bit integer
                int b0 = _mm_cvtss_si32(b);
                a.ptr[0] = b0;

                // Convert back to float to achieve the rounding
                // The problem is that a 32-bit float can't represent all the values
                // a 32-bit integer can (and vice-versa). So this function won't work for
                // large values. (MAYDO: what range exactly?)
                _MM_SET_ROUNDING_MODE(old);
                return a;
            }
            return GDCworkaround();
        }
        else
        {
            // Temporarily force the requested rounding mode in MXCSR.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 32-bit integer
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;

            // Convert back to float to achieve the rounding
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (MAYDO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}


/// Load 128-bits of integer data from memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
__m128i _mm_stream_load_si128 (void* mem_addr) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr);
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // Emit a load tagged !nontemporal so LLVM can use movntdqa.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(cast(__m128i*)mem_addr);
    }
    else
    {
        return *cast(__m128i*)mem_addr; // regular move instead
    }
}
unittest
{
    align(16) static immutable int[4] correct = [1, 2, 3, 4];
    __m128i A = _mm_stream_load_si128(cast(__m128i*)(correct.ptr));
    _mm_mfence();
    assert(A.array == correct);
}

/// Return 1 if all bits in `a` are all 1's. Else return 0.
int _mm_test_all_ones (__m128i a) @safe
{
    // PTEST's CF is 1 iff (~a & mask) == 0; with an all-ones mask that means
    // every single bit of `a` is set.
    return _mm_testc_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(-1);
    __m128i B = _mm_set_epi32(-1, -2, -1, -1);
    assert(_mm_test_all_ones(A) == 1);
    assert(_mm_test_all_ones(B) == 0);
}

/// Return 1 if all bits in `a` are all 0's. Else return 0.
// This is a #BONUS since it was lacking in Intel Intrinsics API.
int _mm_test_all_zeros (__m128i a) @safe
{
    // PTEST's ZF is 1 iff (a & mask) == 0; with an all-ones mask that means
    // every single bit of `a` is clear.
    return _mm_testz_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(0);
    __m128i B = _mm_set_epi32(0, 8, 0, 0);
    assert(_mm_test_all_zeros(A) == 1);
    assert(_mm_test_all_zeros(B) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
/// and return 1 if the result is zero, otherwise return 0.
int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
{
    return _mm_testz_si128(a, mask); // it's really the same, but with a good name
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and mask, and set ZF to 1
/// if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with
/// mask, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and
/// CF values are zero, otherwise return 0.
// Note: tightened from @trusted to @safe — the body is a single call to a
// @trusted function and contains no unsafe operation, matching the sibling
// wrappers above. @safe is strictly stronger, so existing callers are unaffected.
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @safe
{
    return _mm_testnzc_si128(a, mask);
}

/// Compute the bitwise NOT of a and then AND with b, and return 1 if the
/// result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 1 in `a`.
int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // BIC yields b & ~a; the result is 1 exactly when that is all-zero.
        // Acceptable since LDC 1.8 -02
        long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        // Portable emulation of PTEST's CF: check that (~a & b) has no bit set,
        // viewing the 128 bits as two 64-bit lanes.
        long2 notAandB = cast(long2)(~a & b);
        return ((notAandB.array[0] | notAandB.array[1]) == 0) ? 1 : 0;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testc_si128(A, A) == 1);
    assert(_mm_testc_si128(A, M1) == 0);
    assert(_mm_testc_si128(A, M2) == 1);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
// Note: marked `pure` for consistency with `_mm_testc_si128`, which is already
// `pure @trusted` and uses the same builtin family / NEON helpers. Adding `pure`
// is backward-compatible (pure functions are callable from impure code).
int _mm_testnzc_si128 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        long2 s640 = vandq_s64(cast(long2)b, cast(long2)a); //  a & b, zero iff ZF would be 1
        long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a); // ~a & b, zero iff CF would be 1

        // Result is 1 iff both quantities are non-zero (ZF == 0 and CF == 0).
        return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
                | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
    }
    else
    {
        __m128i c = a & b;  // zero iff ZF would be 1
        __m128i d = ~a & b; // zero iff CF would be 1
        int[4] zero = [0, 0, 0, 0];
        return !( (c.array == zero) || (d.array == zero));
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
    __m128i Z = _mm_setzero_si128();
    assert(_mm_testnzc_si128(A, Z) == 0);
    assert(_mm_testnzc_si128(A, M) == 1);
    assert(_mm_testnzc_si128(A, A) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
// Note: marked `pure` for consistency with `_mm_testc_si128`, which is already
// `pure @trusted` and uses the same builtin family / NEON helpers. Adding `pure`
// is backward-compatible (pure functions are callable from impure code).
int _mm_testz_si128 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -02
        long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        // Portable emulation of PTEST's ZF: (a & b) must have no bit set.
        __m128i c = a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testz_si128(A, A) == 0);
    assert(_mm_testz_si128(A, M1) == 1);
    assert(_mm_testz_si128(A, M2) == 0);
}