/**
* SSE4.1 intrinsics.
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.smmintrin;

// SSE4.1 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
// Note: this header will work whether you have SSE4.1 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
// generate SSE4.1 instructions.

public import inteli.types;
import inteli.internals;

// smmintrin pulls in all previous instruction set intrinsics.
public import inteli.tmmintrin;

nothrow @nogc:

enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
enum int _MM_FROUND_TO_NEG_INF = 0x01; /// ditto
enum int _MM_FROUND_TO_POS_INF = 0x02; /// ditto
enum int _MM_FROUND_TO_ZERO = 0x03; /// ditto
enum int _MM_FROUND_CUR_DIRECTION = 0x04; /// ditto
enum int _MM_FROUND_RAISE_EXC = 0x00; /// ditto
enum int _MM_FROUND_NO_EXC = 0x08; /// ditto

enum int _MM_FROUND_NINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT);
enum int _MM_FROUND_FLOOR = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
enum int _MM_FROUND_CEIL = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
enum int _MM_FROUND_TRUNC = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
enum int _MM_FROUND_RINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION);

/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
    }
    else
    {
        // LDC x86: this generates pblendw since LDC 1.1 and -O2
        short8 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
    short[8] correct = [8, 9, 2, 3, 12, 5, 6, 15];
    assert(C.array == correct);
}


/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
// Note: changed signature, GDC needs a compile-time value for `imm8`.
__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    static assert(imm8 >= 0 && imm8 < 4);
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
    }
    else
    {
        // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
        double2 r;
        for (int n = 0; n < 2; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return cast(__m128d)r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0, 1);
    __m128d B = _mm_setr_pd(8, 9);
    double2 C = _mm_blend_pd!2(A, B);
    double[2] correct = [0, 9];
    assert(C.array == correct);
}


/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control mask `imm8`.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static assert(imm8 >= 0 && imm8 < 16);
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendps(a, b, imm8);
    }
    else version(LDC)
    {
        // LDC x86: generates blendps since LDC 1.1 -O2
        // arm64: pretty good, two instructions worst case
        return shufflevector!(float4, (imm8 & 1) ? 4 : 0,
                                      (imm8 & 2) ? 5 : 1,
                                      (imm8 & 4) ? 6 : 2,
                                      (imm8 & 8) ? 7 : 3)(a, b);
    }
    else
    {
        __m128 r;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(0, 1, 2, 3);
    __m128 B = _mm_setr_ps(8, 9, 10, 11);
    float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101
    float[4] correct = [8, 1, 10, 11];
    assert(C.array == correct);
}

/// Blend packed 8-bit integers from `a` and `b` using `mask`.
__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: two instructions since LDC 1.12 -O2
        byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7);
        return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a);
    }
    else
    {
        __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
        return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,
                               8,  9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23,
                              24, 25, 26, 27, 28, 29, 30, 31);
    __m128i M = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8, 127,
                               1,  1, -1, -1,  4,  1,  8, -128);
    byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M);
    byte[16] correct =      [ 0, 17,  2,  3, 20,  5, 22,  7,
                              8,  9, 26, 27, 12, 13, 14, 31 ];
    assert(R.array == correct);
}
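
// Illustrative extra test (not from the upstream suite): since `_mm_blendv_epi8`
// selects from `b` wherever the mask byte has its sign bit set, feeding it a
// comparison result gives a per-byte select; here it reproduces a signed per-byte max.
unittest
{
    __m128i a = _mm_setr_epi8(1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15, -16);
    __m128i b = _mm_setr_epi8(-1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15, 16);
    __m128i m = _mm_cmpgt_epi8(b, a); // 0xFF where b > a
    byte16 r = cast(byte16) _mm_blendv_epi8(a, b, m);
    byte[16] correct = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
    assert(r.array == correct);
}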

/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE42)
    {
        // Amazingly enough, GCC/GDC generates the blendvpd instruction
        // with -msse4.2 but not -msse4.1.
        // Not sure what the reason is, and there is a replacement sequence.
        // Sounds like a bug.
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        long2 shift;
        shift = 63;
        long2 lmask = cast(long2)mask >> shift;
        return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
    }
    else
    {
        __m128d r;
        long2 lmask = cast(long2)mask;
        for (int n = 0; n < 2; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    __m128d M1 = _mm_setr_pd(-3.0, 2.0);
    __m128d R1 = _mm_blendv_pd(A, B, M1);
    double[2] correct1 = [3.0, 2.0];
    assert(R1.array == correct1);

    // BUG: LDC _mm_blendv_pd doesn't work with a NaN mask on arm64 Linux, for some unknown reason,
    // but it does work on arm64 macOS: it yields different results despite FP seemingly not being used.
    version(linux)
    {}
    else
    {
        __m128d M2 = _mm_setr_pd(double.nan, -double.nan);
        __m128d R2 = _mm_blendv_pd(A, B, M2);
        double[2] correct2 = [1.0, 4.0];
        assert(R2.array == correct2);
    }
}


/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        int4 shift;
        shift = 31;
        int4 lmask = cast(int4)mask >> shift;
        return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
    }
    else
    {
        __m128 r;
        int4 lmask = cast(int4)mask;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128 A  = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B  = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
    __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
    __m128 M2 = _mm_setr_ps(float.nan, -float.nan, -0.0f, +0.0f);
    __m128 R1 = _mm_blendv_ps(A, B, M1);
    __m128 R2 = _mm_blendv_ps(A, B, M2);
    float[4] correct1 = [ 4.0f, 1.0f, 2.0f, 7.0f];
    float[4] correct2 = [ 0.0f, 5.0f, 6.0f, 3.0f];
    assert(R1.array == correct1);

    // BUG: as above, LDC _mm_blendv_ps doesn't work with a NaN mask on arm64 Linux, for some unknown reason:
    // it yields different results despite FP seemingly not being used.
    version(linux)
    {}
    else
    {
        assert(R2.array == correct2);
    }
}

/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value,
/// and store the results as packed double-precision floating-point elements.
__m128d _mm_ceil_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        // Unfortunately the x86 intrinsics force a round-trip back to double2;
        // ARM NEON semantics wouldn't need that.
        long2 l = vcvtpq_s64_f64(a);
        double2 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        return r;
    }
    else
    {
        return _mm_round_pd!2(a);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3f, -2.12f);
    __m128d B = _mm_setr_pd(53.6f, -2.7f);
    A = _mm_ceil_pd(A);
    B = _mm_ceil_pd(B);
    double[2] correctA = [2.0, -2.0];
    double[2] correctB = [54.0, -2.0];
    assert(A.array == correctA);
    assert(B.array == correctB);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value,
/// and store the results as packed single-precision floating-point elements.
__m128 _mm_ceil_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        int4 l = vcvtpq_s32_f32(a);
        float4 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        r.ptr[2] = l.array[2];
        r.ptr[3] = l.array[3];
        return r;
    }
    else
    {
        return _mm_round_ps!2(a);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 C = _mm_ceil_ps(A);
    float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f];
    assert(C.array == correct);
}

/// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value,
/// store the result as a double-precision floating-point element in the lower element of result,
/// and copy the upper element from `a` to the upper element of dst.
__m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtps_s64_f64(b[0]);
        return a;
    }
    else
    {
        return _mm_round_sd!2(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    __m128d B = _mm_setr_pd(53.6, -3.7);
    __m128d C = _mm_ceil_sd(A, B);
    double[2] correct = [54.0, -2.12];
    assert(C.array == correct);
}

/// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
/// store the result as a single-precision floating-point element in the lower element of result,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtps_s32_f32(b[0]);
        return a;
    }
    else
    {
        return _mm_round_ss!2(a, b);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
    __m128 C = _mm_ceil_ss(A, B);
    float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f];
    assert(C.array == correct);
}

/// Compare packed 64-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
    }
    else version(LDC)
    {
        // LDC x86: generates pcmpeqq since LDC 1.1 -O1
        // arm64: generates cmeq since LDC 1.8 -O1
        return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
    }
    else
    {
        // Clever pcmpeqd + pand use with LDC 1.24 -O2
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 res;
        res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
        res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
        return cast(__m128i)res;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, -2);
    __m128i B = _mm_setr_epi64(-3, -2);
    __m128i C = _mm_setr_epi64(-1, -4);
    long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
    long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
    long[2] correct1 = [0, -1];
    long[2] correct2 = [-1, 0];
    assert(AB.array == correct1);
    assert(AC.array == correct2);
}


/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = sext <4 x i16> %v to <4 x i32>
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi16_epi32(A);
    int[4] correct = [-1, 0, -32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i16> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi16_epi64(A);
    long[2] correct = [-32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i32> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
    }
    else
    {
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepi32_epi64(A);
    long[2] correct = [-4, 42];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepi8_epi16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
    }
    else version(LDC)
    {
        // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0
        // LDC ARM64: sshll generated since LDC 1.8.0 -O1
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
            %r = sext <8 x i8> %v to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        short8 r;
        foreach(n; 0..8)
            r.ptr[n] = sa.array[n];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepi8_epi16(A);
    short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE41)
    {
        // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            %r = sext <4 x i8> %v to <4 x i32>
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
    }
    else
    {
        // LDC ARM64: this gives the same codegen as a vmovl_s16/vmovl_s8 sequence would
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi8_epi32(A);
    int[4] correct = [127, -128, 1, -1];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxbq since LDC 1.1 -O0,
        // LDC arm64: it's ok since LDC 1.8 -O1
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i8> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        long2 r;
        foreach(n; 0..2)
            r.ptr[n] = sa.array[n];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi8_epi64(A);
    long[2] correct = [127, -128];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1, also good without SSE4.1
        // arm64: ushll since LDC 1.12 -O1
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu16_epi32(A);
    int[4] correct = [65535, 0, 32768, 32767];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit shorter than below, in -O2
        short8 sa = cast(short8)a;
        long2 r;
        for(int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ushort)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxwq since LDC 1.12 -O1, also good without SSE4.1
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu16_epi64(A);
    long[2] correct = [65535, 0];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pmovzxdq since LDC 1.12 -O1, also good without SSE4.1
        // arm64: generates ushll since LDC 1.12 -O1
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = cast(uint)sa.array[0];
        r.ptr[1] = cast(uint)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepu32_epi64(A);
    long[2] correct = [4294967295, 42];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepu8_epi16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
    }
    else
    {
        // LDC x86: generates pmovzxbw since LDC 1.12 -O1, also good without SSE4.1
        // arm64: ushll since LDC 1.12 -O1
        // PERF: catastrophic with GDC without SSE4.1
        byte16 sa = cast(byte16)a;
        short8 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        r.ptr[2] = cast(ubyte)sa.array[2];
        r.ptr[3] = cast(ubyte)sa.array[3];
        r.ptr[4] = cast(ubyte)sa.array[4];
        r.ptr[5] = cast(ubyte)sa.array[5];
        r.ptr[6] = cast(ubyte)sa.array[6];
        r.ptr[7] = cast(ubyte)sa.array[7];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepu8_epi16(A);
    short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit better than below in -O2
        byte16 sa = cast(byte16)a;
        int4 r;
        for(int n = 0; n < 4; ++n)
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxbd since LDC 1.12 -O1, also good without SSE4.1
        // PERF: catastrophic with GDC without SSE4.1
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        r.ptr[2] = cast(ubyte)sa.array[2];
        r.ptr[3] = cast(ubyte)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu8_epi32(A);
    int[4] correct = [127, 128, 1, 255];
    assert(C.array == correct);
}

/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: this optimizes better than the loop below
        byte16 sa = cast(byte16)a;
        long2 r;
        for (int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
        byte16 sa = cast(byte16)a;
        long2 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu8_epi64(A);
    long[2] correct = [127, 254];
    assert(C.array == correct);
}

/// Conditionally multiply the packed double-precision (64-bit) floating-point elements
/// in `a` and `b` using the high 4 bits in `imm8`, sum the two products, and conditionally
/// store the sum in dst using the low 4 bits of `imm8`.
__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else
    {
        __m128d zero = _mm_setzero_pd();
        __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b);
        double sum = temp.array[0] + temp.array[1];
        return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum));
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 8.0);
    double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B);
    double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B);
    double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B);
    double[2] correct1 = [ 4.0,  4.0];
    double[2] correct2 = [16.0,  0.0];
    double[2] correct3 = [ 0.0, 20.0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}

/// Conditionally multiply the packed single-precision (32-bit) floating-point elements
/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products,
/// and conditionally store the sum in result using the low 4 bits of `imm8`.
__m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(byte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(byte)imm8);
    }
    else
    {
        __m128 zero = _mm_setzero_ps();
        __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b);
        float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
        return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum));
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
    __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
    float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B);
    float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B);
    float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B);
    float[4] correct1 = [67.0f, 67.0f, 67.0f, 67.0f];
    float[4] correct2 = [23.0f,  0.0f, 23.0f,  0.0f];
    float[4] correct3 = [ 0.0f, 29.0f,  0.0f, 29.0f];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}
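
// Illustrative extra test (not from the upstream suite): in the `imm8` of `_mm_dp_ps`,
// the high nibble chooses which element products enter the sum and the low nibble
// chooses which result lanes receive it; 0x71 sums lanes 0..2 and writes lane 0 only.
unittest
{
    __m128 x = _mm_setr_ps(1.0f, 2.0f, 3.0f, 100.0f);
    __m128 y = _mm_setr_ps(4.0f, 5.0f, 6.0f, 100.0f);
    float4 r = _mm_dp_ps!0x71(x, y); // 1*4 + 2*5 + 3*6 = 32
    float[4] correct = [32.0f, 0.0f, 0.0f, 0.0f];
    assert(r.array == correct);
}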

/// Extract a 32-bit integer from `a`, selected with `imm8`.
int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
{
    return (cast(int4)a).array[imm8 & 3];
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    assert(_mm_extract_epi32(A, 0) == 1);
    assert(_mm_extract_epi32(A, 1 + 8) == 2);
    assert(_mm_extract_epi32(A, 3 + 4) == 4);
}

/// Extract a 64-bit integer from `a`, selected with `imm8`.
long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
{
    long2 la = cast(long2)a;
    return la.array[imm8 & 1];
}
unittest
{
    __m128i A = _mm_setr_epi64(45, -67);
    assert(_mm_extract_epi64(A, 0) == 45);
    assert(_mm_extract_epi64(A, 1) == -67);
    assert(_mm_extract_epi64(A, 2) == 45);
}

/// Extract an 8-bit integer from `a`, selected with `imm8`.
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
{
    byte16 ba = cast(byte16)a;
    return cast(ubyte) ba.array[imm8 & 15];
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
    assert(_mm_extract_epi8(A, 7) == 7);
    assert(_mm_extract_epi8(A, 13) == 255);
    assert(_mm_extract_epi8(A, 7 + 16) == 7);
}

/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
/// Note: returns a 32-bit $(I integer).
int _mm_extract_ps (__m128 a, const int imm8) @trusted
{
    return (cast(int4)a).array[imm8 & 3];
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
    assert(_mm_extract_ps(A, 0) == 0x3f800000);
    assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
    assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
}


/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an
/// integer value, and store the results as packed double-precision floating-point elements.
__m128d _mm_floor_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        long2 l = vcvtmq_s64_f64(a);
        double2 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        return r;
    }
    else
    {
        return _mm_round_pd!1(a);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3f, -2.12f);
    __m128d B = _mm_setr_pd(53.6f, -2.7f);
    A = _mm_floor_pd(A);
    B = _mm_floor_pd(B);
    double[2] correctA = [1.0, -3.0];
    double[2] correctB = [53.0, -3.0];
    assert(A.array == correctA);
    assert(B.array == correctB);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an
/// integer value, and store the results as packed single-precision floating-point elements.
__m128 _mm_floor_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        int4 l = vcvtmq_s32_f32(a);
        float4 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        r.ptr[2] = l.array[2];
        r.ptr[3] = l.array[3];
        return r;
    }
    else
    {
        return _mm_round_ps!1(a);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 C = _mm_floor_ps(A);
    float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
    assert(C.array == correct);
}

/// Round the lower double-precision (64-bit) floating-point element in `b` down to an
/// integer value, store the result as a double-precision floating-point element in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtms_s64_f64(b[0]);
        return a;
    }
    else
    {
        return _mm_round_sd!1(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    __m128d B = _mm_setr_pd(-53.1, -3.7);
    __m128d C = _mm_floor_sd(A, B);
    double[2] correct = [-54.0, -2.12];
    assert(C.array == correct);
}

/// Round the lower single-precision (32-bit) floating-point element in `b` down to an
/// integer value, store the result as a single-precision floating-point element in the
/// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
__m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtms_s32_f32(b[0]);
        return a;
    }
    else
    {
        return _mm_round_ss!1(a, b);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
    __m128 C = _mm_floor_ss(A, B);
    float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f];
    assert(C.array == correct);
}

/// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
    // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
    // LDC arm64: ins.s since LDC 1.8 -O2
    int4 ia = cast(int4)a;
    ia.ptr[imm8 & 3] = i;
    return cast(__m128i)ia;
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4);
    int[4] result = [1, 2, 5, 4];
    assert(C.array == result);
}

/// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
    // LDC x86: always does something sensible.
    long2 la = cast(long2)a;
    la.ptr[imm8 & 1] = i;
    return cast(__m128i)la;
}
unittest
{
    __m128i A = _mm_setr_epi64(1, 2);
    long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
    long[2] result = [1, 5];
    assert(C.array == result);
}

/// Copy `a` to result, and insert the lower 8-bit integer from `i` into the result
/// at the location specified by `imm8[3:0]`.
__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
{
    // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
    // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
    byte16 ba = cast(byte16)a;
    ba.ptr[imm8 & 15] = cast(byte)i;
    return cast(__m128i)ba;
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
    byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    assert(C.array == result);
}


/// Warning: of course it does something totally different from `_mm_insert_epi32`!
/// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b`
/// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]`
/// (elements are zeroed out when the corresponding bit is set).
__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
    }
    else
    {
        float4 tmp2 = a;
        float tmp1 = b.array[(imm8 >> 6) & 3];
        tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
        return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
    float[4] correct = [1.0f, 2.0f, 0.0f, 7.0f];
    assert(C.array == correct);
}
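
// Illustrative extra test (not from the upstream suite): in the `imm8` of `_mm_insert_ps`,
// bits [7:6] pick the source lane of `b`, bits [5:4] the destination lane in `a`,
// and bits [3:0] zero out result lanes.
unittest
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    // lane 2 of b (7.0f) goes to lane 0 of a, and lane 3 of the result is zeroed
    __m128 r = _mm_insert_ps!((2 << 6) | (0 << 4) | 0b1000)(a, b);
    float[4] correct = [7.0f, 2.0f, 3.0f, 0.0f];
    assert(r.array == correct);
}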

/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxsd since LDC 1.1 -O1
        // ARM: smax.4s since LDC 1.8 -O1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        int4 greater = greaterMask!int4(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i higher = _mm_cmpgt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
                                      _mm_setr_epi32(        -4, -8, 9, -8));
    int[4] correct = [0x7fffffff, 1, 9, 7];
    assert(R.array == correct);
}

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed maximum values.
__m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pmaxsb since LDC 1.1 -O1
        // ARM64: smax.16b since LDC 1.8.0 -O1
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_max_epi8(A, B);
    byte[16] correct = [127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pmaxuw since LDC 1.1 -O1
        // ARM64: umax.8h since LDC 1.8.0 -O1
        // PERF: without sse4.1, LLVM 12 produces a very interesting
        //       psubusw xmm0, xmm1
        //       paddw   xmm0, xmm1
        //       sequence that maybe should go in other min/max intrinsics?
        ushort8 sa = cast(ushort8)a;
        ushort8 sb = cast(ushort8)b;
        ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        b = _mm_subs_epu16(b, a);
        b = _mm_add_epi16(b, a);
        return b;
    }
}
unittest
{
    short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57),
                                          _mm_setr_epi16(-4, -8, 9, -7, 0, -32768, 0, 0));
    short[8] correct = [-4, -8, -4, -7, 9, -32768, 0, 57];
    assert(R.array == correct);
}

/// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umax.4s since LDC 1.8.0 -O1
        uint4 sa = cast(uint4)a;
        uint4 sb = cast(uint4)b;
        uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i valueShift = _mm_set1_epi32(-0x80000000);
        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7),
                                      _mm_setr_epi32(        -4, -8, 9, -8));
    int[4] correct = [-4, -8, 9, -7];
    assert(R.array == correct);
}

/// Compare packed signed 32-bit integers in `a` and `b`, returns packed minimum values.
__m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
        // ARM: smin.4s since LDC 1.8 -O1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        int4 greater = greaterMask!int4(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i higher = _mm_cmplt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
                                      _mm_setr_epi32(        -4, -8, 9, -8));
    int[4] correct = [-4, -8, -4, -8];
    assert(R.array == correct);
}

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed minimum values.
__m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pminsb since LDC 1.1 -O1
        // ARM64: smin.16b since LDC 1.8.0 -O1
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_min_epi8(A, B);
    byte[16] correct = [4, -8, -4, -8, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 16-bit integers in `a` and `b`, and store packed minimum values in dst.
__m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
        // ARM64: umin.8h since LDC 1.8.0 -O1
        ushort8 sa = cast(ushort8)a;
        ushort8 sb = cast(ushort8)b;
        ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i c = _mm_subs_epu16(b, a);
        b = _mm_sub_epi16(b, c);
        return b;
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57),
                                          _mm_setr_epi16(-4, -8, 9, -7, 0, -32768, 0, 0));
    short[8] correct = [32767, 1, 9, -8, 0, 7, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 32-bit integers in `a` and `b`, and store packed minimum values in dst.
__m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umin.4s since LDC 1.8.0 -O1
        uint4 sa = cast(uint4)a;
        uint4 sb = cast(uint4)b;
        uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i valueShift = _mm_set1_epi32(-0x80000000);
        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift));
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7),
                                      _mm_setr_epi32(        -4, -8, 9, -8));
    int[4] correct = [0x7fffffff, 1, 4, -8];
    assert(R.array == correct);
}

/// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`,
/// store the minimum and index in return value, and zero the remaining bits.
__m128i _mm_minpos_epu16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
        __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
        __m128i best = _mm_min_epu32(combinedLo, combinedHi);
        best = _mm_min_epu32(best, _mm_srli_si128!8(best));
        best = _mm_min_epu32(best, _mm_srli_si128!4(best));
        short8 sbest = cast(short8)best;
        short8 r;
        r[0] = sbest[1];
        r[1] = sbest[0]; // Note: the search key keeps the index in the low bits, so the lower index wins in case of a tie
        r[2] = 0;
        r[3] = 0;
        r[4] = 0;
        r[5] = 0;
        r[6] = 0;
        r[7] = 0;
        return cast(__m128i)r;
    }
    else
    {
        short8 sa = cast(short8)a;
        ushort min = 0xffff;
        int index = 0;
        for(int n = 0; n < 8; ++n)
        {
            ushort c = sa.array[n];
            if (c < min)
            {
                min = c;
                index = n;
            }
        }
        short8 r;
        r.ptr[0] = min;
        r.ptr[1] = cast(short)index;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
    __m128i B = _mm_setr_epi16(14, 4, 4, 2, -3, 2, 5, 6);
    short8 R1 = cast(short8) _mm_minpos_epu16(A);
    short8 R2 = cast(short8) _mm_minpos_epu16(B);
    short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
    short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
}
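
// Illustrative extra test (not from the upstream suite): the minimum lands in lane 0
// and its index in lane 1, so both can be read back with `_mm_extract_epi16`
// (an SSE2 intrinsic pulled in by this module).
unittest
{
    __m128i v = _mm_setr_epi16(9, 4, 7, 4, 8, 6, 5, 3);
    __m128i m = _mm_minpos_epu16(v);
    assert(_mm_extract_epi16(m, 0) == 3); // minimum value
    assert(_mm_extract_epi16(m, 1) == 7); // its index
}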

/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers
/// in `a` compared to those in `b`, and store the 16-bit results in dst.
/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`.
/// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`.
/// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting
/// at the offset specified in `imm8[2]`.
__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
    }
    else
    {
        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high-order quadruplets are unaddressable...
        int b_offset = (imm8 & 3) * 4;

        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        short8 r;

        __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);

        for (int j = 0; j < 8; j += 2)
        {
            int k = a_offset + j;
            __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
                                           0, 0, 0, 0,
                                           ba[k+1], ba[k+2], ba[k+3], ba[k+4],
                                           0, 0, 0, 0);
            short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64
            r.ptr[j]   = diffs.array[0];
            r.ptr[j+1] = diffs.array[4];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5, 5, 5, 12, 13, 14, 15);
    short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
    short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
    short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
    short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
    short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
    short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
    short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
    short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
    short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
    short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
    assert(r1.array == correct1);
    assert(r4.array == correct4);
    assert(r5.array == correct5);
    assert(r7.array == correct7);
    assert(r8.array == correct0);
}

/// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`, and store the signed 64-bit results in dst.
__m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSE41)
    {
        // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
        // Use IR instead.
        // This generates pmuldq since LDC 1.2.0 -O0
        enum ir = `
            %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
            %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
            %la = sext <2 x i32> %ia to <2 x i64>
            %lb = sext <2 x i32> %ib to <2 x i64>
            %r = mul <2 x i64> %la, %lb
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // 3 instructions since LDC 1.8 -O2
        // But had to make vmull_s32 be a builtin, else it wouldn't optimize to smull
        int2 a_lo = vmovn_s64(cast(long2)a);
        int2 b_lo = vmovn_s64(cast(long2)b);
        return cast(__m128i) vmull_s32(a_lo, b_lo);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        long2 r;
        r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
        r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    long2 R = cast(long2) _mm_mul_epi32(A, B);
    long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
    assert(R.array == correct);
}

/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers,
/// return the low 32 bits of the intermediate integers.
__m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    // PERF GDC without SSE4.1 could be better
    static if (GDC_with_SSE41)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        // Note: older GDC doesn't have that op, but older GDC
        // also has no support for -msse4.1 detection
        return cast(__m128i)(a * b);
    }
    else version(LDC)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        return cast(__m128i)(a * b);
    }
    else
    {
        // DMD doesn't take the above
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] * ib.array[0];
        r.ptr[1] = ia.array[1] * ib.array[1];
        r.ptr[2] = ia.array[2] * ib.array[2];
        r.ptr[3] = ia.array[3] * ib.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    int4 R = cast(int4) _mm_mullo_epi32(A, B);
    int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
    assert(R.array == correct);
}
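
// Illustrative extra test (not from the upstream suite): `_mm_mul_epi32` only reads
// lanes 0 and 2, so the odd lanes can be multiplied too by shifting them down
// with `_mm_srli_si128` before the widening multiply.
unittest
{
    __m128i a = _mm_setr_epi32(2, -3, 5, 7);
    __m128i b = _mm_setr_epi32(4, 6, -8, 9);
    long2 even = cast(long2) _mm_mul_epi32(a, b);                // lanes 0 and 2
    long2 odd  = cast(long2) _mm_mul_epi32(_mm_srli_si128!4(a),
                                           _mm_srli_si128!4(b)); // lanes 1 and 3
    long[2] correctEven = [8, -40];
    long[2] correctOdd  = [-18, 63];
    assert(even.array == correctEven);
    assert(odd.array == correctOdd);
}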


/// Convert packed signed 32-bit integers from `a` and `b`
/// to packed 16-bit integers using unsigned saturation.
__m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE41)
    {
        // PERF For some reason this doesn't generate the builtin???
        return cast(__m128i) __builtin_ia32_packusdw128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_packusdw128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        int4 z;
        z = 0;
        return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
                                          vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
    }
    else
    {
        // PERF: not great without SSE4.1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        ushort[8] result;
        for (int i = 0; i < 4; ++i)
        {
            int s = sa.array[i];
            if (s < 0) s = 0;
            if (s > 65535) s = 65535;
            result.ptr[i] = cast(ushort)s;

            s = sb.array[i];
            if (s < 0) s = 0;
            if (s > 65535) s = 65535;
            result.ptr[i+4] = cast(ushort)s;
        }
        return cast(__m128i) loadUnaligned!(short8)(cast(short*)result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packus_epi32(A, A);
    short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
    assert(R.array == correct);
}


/// Round the packed double-precision (64-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed double-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_pd(int rounding)(__m128d a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);
            return _mm_setr_pd(lo, hi);
        }
        else
        {
            version(GNU) pragma(inline, false); // else fail unittest with optimizations

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return _mm_setr_pd(lo, hi);
        }
    }
}
unittest
{
    // tested in other intrinsics
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed single-precision floating-point elements.

/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF     |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF     |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO        |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ps(int rounding)(__m128 a) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            __m128i integers = _mm_cvtps_epi32(a);
            return _mm_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get shuffled around
            // TODO: is this caused by __builtin_ia32_cvtps2dq being marked pure? It isn't pure, since it reads MXCSR.

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m128i integers = _mm_cvtps_epi32(a);

            // Convert back to float to achieve the rounding.
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            __m128 result = _mm_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
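
// Additional illustrative example for _mm_round_ps (values chosen for illustration):
// truncation towards zero with exceptions suppressed.
unittest
{
    __m128 A = _mm_setr_ps(1.7f, -1.7f, 2.5f, -2.5f);
    __m128 R = _mm_round_ps!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(A);
    float[4] correct = [1.0f, -1.0f, 2.0f, -2.0f];
    assert(R.array == correct);
}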

/// Round the lower double-precision (64-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a double-precision floating-point element
/// in the lower element of result, and copy the upper element from `a` to the upper element of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF     |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF     |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO        |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;
            return a;
        }
        else
        {
            version(GNU) pragma(inline, false); // else the unittest fails with optimizations

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;

            // Convert back to double to achieve the rounding.
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}

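// Additional illustrative example for _mm_round_sd (values chosen for illustration):
// the lower lane of `b` is rounded up (ceiling), the upper lane of `a` is preserved.
unittest
{
    __m128d A = _mm_setr_pd(42.0, -7.5);
    __m128d B = _mm_setr_pd(-2.3, 100.0);
    __m128d R = _mm_round_sd!(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)(A, B);
    double[2] correct = [-2.0, -7.5];
    assert(R.array == correct);
}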

/// Round the lower single-precision (32-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a single-precision floating-point element
/// in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF     |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF     |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO        |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;
            return a;
        }
        else
        {
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 32-bit integer
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;

            // Convert back to float to achieve the rounding.
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
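
// Additional illustrative example for _mm_round_ss: _MM_FROUND_CUR_DIRECTION uses MXCSR.RC,
// which is round-to-nearest unless the rounding mode was changed (assumed to be the default
// here; values chosen for illustration).
unittest
{
    __m128 A = _mm_setr_ps(42.0f, 1.0f, 2.0f, 3.0f);
    __m128 B = _mm_setr_ps(-2.6f, 0.0f, 0.0f, 0.0f);
    __m128 R = _mm_round_ss!(_MM_FROUND_CUR_DIRECTION)(A, B);
    float[4] correct = [-3.0f, 1.0f, 2.0f, 3.0f];
    assert(R.array == correct);
}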


/// Load 128-bits of integer data from memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
__m128i _mm_stream_load_si128 (__m128i * mem_addr) @trusted
{
    // BUG: see `_mm_stream_ps` for an explanation of why we don't implement non-temporal moves
    return *mem_addr; // it's a regular move instead
}


/// Return 1 if all bits in `a` are 1's, otherwise return 0.
int _mm_test_all_ones (__m128i a) @safe
{
    return _mm_testc_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(-1);
    __m128i B = _mm_set_epi32(-1, -2, -1, -1);
    assert(_mm_test_all_ones(A) == 1);
    assert(_mm_test_all_ones(B) == 0);
}

/// Return 1 if all bits in `a` are 0's, otherwise return 0.
// This is a #BONUS intrinsic, since it is lacking from the Intel Intrinsics API.
int _mm_test_all_zeros (__m128i a) @safe
{
    return _mm_testz_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(0);
    __m128i B = _mm_set_epi32(0, 8, 0, 0);
    assert(_mm_test_all_zeros(A) == 1);
    assert(_mm_test_all_zeros(B) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
/// and return 1 if the result is zero, otherwise return 0.
int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
{
    return _mm_testz_si128(a, mask); // it's really the same, but with a better name
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `mask`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
{
    return _mm_testnzc_si128(a, mask);
}

/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the
/// result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 1 in `a`.
int _mm_testc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testc_si128(A, A) == 1);
    assert(_mm_testc_si128(A, M1) == 0);
    assert(_mm_testc_si128(A, M2) == 1);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
        long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);

        return !(  !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
                 | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
    }
    else
    {
        __m128i c = a & b;
        __m128i d = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return !( (c.array == zero) || (d.array == zero) );
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
    __m128i Z = _mm_setzero_si128();
    assert(_mm_testnzc_si128(A, Z) == 0);
    assert(_mm_testnzc_si128(A, M) == 1);
    assert(_mm_testnzc_si128(A, A) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm_testz_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testz_si128(A, A) == 0);
    assert(_mm_testz_si128(A, M1) == 1);
    assert(_mm_testz_si128(A, M2) == 0);
}
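
// Additional illustrative checks for the two-argument _mm_test_all_zeros and for
// _mm_test_mix_ones_zeros, which have no dedicated unittest above (values chosen
// for illustration).
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M = _mm_setr_epi32(0xf0, 0x00, 0x00, 0x00);
    assert(_mm_test_all_zeros(A, M) == 1);      // A & M == 0
    assert(_mm_test_all_zeros(A, A) == 0);

    __m128i N = _mm_setr_epi32(0x03, 0x00, 0x00, 0x00);
    assert(_mm_test_mix_ones_zeros(A, N) == 1); // N selects both 0-bits and 1-bits of A
    assert(_mm_test_mix_ones_zeros(A, A) == 0); // no bit selected by A is 0 in A
}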