1 /** 2 * SSE4.1 intrinsics. 3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1 4 * 5 * Copyright: Guillaume Piolat 2021. 6 * Johan Engelen 2021. 7 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 8 */ 9 module inteli.smmintrin; 10 11 // SSE4.1 instructions 12 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1 13 // Note: this header will work whether you have SSE4.1 enabled or not. 14 // With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively 15 // generate SSE4.1 instructions. 16 // With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions. 17 18 public import inteli.types; 19 import inteli.internals; 20 21 // smmintrin pulls in all previous instruction set intrinsics. 22 public import inteli.tmmintrin; 23 24 nothrow @nogc: 25 26 enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes 27 enum int _MM_FROUND_TO_NEG_INF = 0x01; /// ditto 28 enum int _MM_FROUND_TO_POS_INF = 0x02; /// ditto 29 enum int _MM_FROUND_TO_ZERO = 0x03; /// ditto 30 enum int _MM_FROUND_CUR_DIRECTION = 0x04; /// ditto 31 enum int _MM_FROUND_RAISE_EXC = 0x00; /// ditto 32 enum int _MM_FROUND_NO_EXC = 0x08; /// ditto 33 34 enum int _MM_FROUND_NINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT); 35 enum int _MM_FROUND_FLOOR = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF); 36 enum int _MM_FROUND_CEIL = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF); 37 enum int _MM_FROUND_TRUNC = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO); 38 enum int _MM_FROUND_RINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION); 39 enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION); 40 41 /// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results. 42 // Note: changed signature, GDC needs a compile-time value for imm8. 43 __m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) @trusted 44 { 45 // PERF DMD 46 static if (GDC_with_SSE41) 47 { 48 return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8); 49 } 50 else 51 { 52 // LDC x86 This generates pblendw since LDC 1.1 and -O2 53 short8 r; 54 short8 sa = cast(short8)a; 55 short8 sb = cast(short8)b; 56 for (int n = 0; n < 8; ++n) 57 { 58 r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n]; 59 } 60 return cast(__m128i)r; 61 } 62 } 63 unittest 64 { 65 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 66 __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); 67 short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011 68 short[8] correct = [8, 9, 2, 3, 12, 5, 6, 15]; 69 assert(C.array == correct); 70 } 71 72 73 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`. 74 // Note: changed signature, GDC needs a compile-time value for `imm8`. 75 __m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted 76 { 77 static assert(imm8 >= 0 && imm8 < 4); 78 // PERF DMD 79 static if (GDC_with_SSE41) 80 { 81 return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8); 82 } 83 else 84 { 85 // LDC x86: blendpd since LDC 1.1 -02, uses blendps after LDC 1.12 86 double2 r; 87 for (int n = 0; n < 2; ++n) 88 { 89 r.ptr[n] = (imm8 & (1 << n)) ? 
b.array[n] : a.array[n]; 90 } 91 return cast(__m128d)r; 92 } 93 } 94 unittest 95 { 96 __m128d A = _mm_setr_pd(0, 1); 97 __m128d B = _mm_setr_pd(8, 9); 98 double2 C = _mm_blend_pd!2(A, B); 99 double[2] correct = [0, 9]; 100 assert(C.array == correct); 101 } 102 103 104 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control mask `imm8`. 105 // Note: changed signature, GDC needs a compile-time value for imm8. 106 __m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) @trusted 107 { 108 // PERF DMD 109 static assert(imm8 >= 0 && imm8 < 16); 110 static if (GDC_with_SSE41) 111 { 112 return __builtin_ia32_blendps(a, b, imm8); 113 } 114 else version(LDC) 115 { 116 // LDC x86: generates blendps since LDC 1.1 -O2 117 // arm64: pretty good, two instructions worst case 118 return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0, 119 (imm8 & 2) ? 5 : 1, 120 (imm8 & 4) ? 6 : 2, 121 (imm8 & 8) ? 7 : 3)(a, b); 122 } 123 else 124 { 125 __m128 r; // PERF =void; 126 for (int n = 0; n < 4; ++n) 127 { 128 r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n]; 129 } 130 return r; 131 } 132 } 133 unittest 134 { 135 __m128 A = _mm_setr_ps(0, 1, 2, 3); 136 __m128 B = _mm_setr_ps(8, 9, 10, 11); 137 float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101 138 float[4] correct = [8, 1, 10, 11]; 139 assert(C.array == correct); 140 } 141 142 /// Blend packed 8-bit integers from `a` and `b` using `mask`. 143 __m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted 144 { 145 // PERF DMD 146 /*static if (GDC_with_SSE41) 147 { 148 // This intrinsic do nothing in GDC 12. 149 // TODO report to GDC. No problem in GCC. 150 return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask); 151 } 152 else*/ 153 static if (LDC_with_SSE41) 154 { 155 return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask); 156 } 157 else static if (LDC_with_ARM64) 158 { 159 // LDC arm64: two instructions since LDC 1.12 -O2 160 byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7); 161 return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a); 162 } 163 else 164 { 165 __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask); 166 return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b); 167 } 168 } 169 unittest 170 { 171 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 172 8, 9, 10, 11, 12, 13, 14, 15); 173 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 174 24, 25, 26, 27, 28, 29, 30, 31); 175 __m128i M = _mm_setr_epi8( 1, -1, 1, 1, -4, 1, -8, 127, 176 1, 1, -1, -1, 4, 1, 8, -128); 177 byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M); 178 byte[16] correct = [ 0, 17, 2, 3, 20, 5, 22, 7, 179 8, 9, 26, 27, 12, 13, 14, 31 ]; 180 assert(R.array == correct); 181 } 182 183 184 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`. 185 __m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted 186 { 187 // PERF DMD 188 static if (GDC_with_SSE42) 189 { 190 // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction 191 // with -msse4.2 but not -msse4.1. 192 // Not sure what is the reason, and there is a replacement sequence. 193 // Sounds like a bug. 
194 return __builtin_ia32_blendvpd(a, b, mask); 195 } 196 else static if (LDC_with_SSE41) 197 { 198 return __builtin_ia32_blendvpd(a, b, mask); 199 } 200 else static if (LDC_with_ARM64) 201 { 202 long2 shift; 203 shift = 63; 204 long2 lmask = cast(long2)mask >> shift; 205 return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a); 206 } 207 else 208 { 209 __m128d r; // PERF =void; 210 long2 lmask = cast(long2)mask; 211 for (int n = 0; n < 2; ++n) 212 { 213 r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; 214 } 215 return r; 216 } 217 } 218 unittest 219 { 220 __m128d A = _mm_setr_pd(1.0, 2.0); 221 __m128d B = _mm_setr_pd(3.0, 4.0); 222 __m128d M1 = _mm_setr_pd(-3.0, 2.0); 223 __m128d R1 = _mm_blendv_pd(A, B, M1); 224 double[2] correct1 = [3.0, 2.0]; 225 assert(R1.array == correct1); 226 227 // BUG: LDC _mm_blendv_pd doesn't work with NaN mask in arm64 Linux for some unknown reason. 228 // but it does work in arm64 macOS 229 // yields different results despite FP seemingly not being used 230 version(linux) 231 {} 232 else 233 { 234 __m128d M2 = _mm_setr_pd(double.nan, -double.nan); 235 __m128d R2 = _mm_blendv_pd(A, B, M2); 236 double[2] correct2 = [1.0, 4.0]; 237 assert(R2.array == correct2); 238 } 239 } 240 241 242 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`. 243 __m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted 244 { 245 // PERF DMD 246 static if (GDC_with_SSE41) 247 { 248 return __builtin_ia32_blendvps(a, b, mask); 249 } 250 else static if (LDC_with_SSE41) 251 { 252 return __builtin_ia32_blendvps(a, b, mask); 253 } 254 else static if (LDC_with_ARM64) 255 { 256 int4 shift; 257 shift = 31; 258 int4 lmask = cast(int4)mask >> shift; 259 return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a); 260 } 261 else 262 { 263 __m128 r; // PERF =void; 264 int4 lmask = cast(int4)mask; 265 for (int n = 0; n < 4; ++n) 266 { 267 r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; 268 } 269 return r; 270 } 271 } 272 unittest 273 { 274 __m128 A = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f); 275 __m128 B = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f); 276 __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f); 277 __m128 M2 = _mm_setr_ps(float.nan, -float.nan, -0.0f, +0.0f); 278 __m128 R1 = _mm_blendv_ps(A, B, M1); 279 __m128 R2 = _mm_blendv_ps(A, B, M2); 280 float[4] correct1 = [ 4.0f, 1.0f, 2.0f, 7.0f]; 281 float[4] correct2 = [ 0.0f, 5.0f, 6.0f, 3.0f]; 282 assert(R1.array == correct1); 283 284 // BUG: like above, LDC _mm_blendv_ps doesn't work with NaN mask in arm64 Linux for some unknown reason. 285 // yields different results despite FP seemingly not being used 286 version(linux) 287 {} 288 else 289 { 290 assert(R2.array == correct2); 291 } 292 } 293 294 /// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value, 295 /// and store the results as packed double-precision floating-point elements. 
296 __m128d _mm_ceil_pd (__m128d a) @trusted 297 { 298 static if (LDC_with_ARM64) 299 { 300 // LDC arm64 acceptable since 1.8 -O2 301 // Unfortunately x86 intrinsics force a round-trip back to double2 302 // ARM neon semantics wouldn't have that 303 long2 l = vcvtpq_s64_f64(a); 304 double2 r; 305 r.ptr[0] = l.array[0]; 306 r.ptr[1] = l.array[1]; 307 return r; 308 } 309 else 310 { 311 return _mm_round_pd!2(a); 312 } 313 } 314 unittest 315 { 316 __m128d A = _mm_setr_pd(1.3f, -2.12f); 317 __m128d B = _mm_setr_pd(53.6f, -2.7f); 318 A = _mm_ceil_pd(A); 319 B = _mm_ceil_pd(B); 320 double[2] correctA = [2.0, -2.0]; 321 double[2] correctB = [54.0, -2.0]; 322 assert(A.array == correctA); 323 assert(B.array == correctB); 324 } 325 326 /// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, 327 /// and store the results as packed single-precision floating-point elements. 328 __m128 _mm_ceil_ps (__m128 a) @trusted 329 { 330 static if (LDC_with_ARM64) 331 { 332 // LDC arm64 acceptable since 1.8 -O1 333 int4 l = vcvtpq_s32_f32(a); 334 float4 r; 335 r.ptr[0] = l.array[0]; 336 r.ptr[1] = l.array[1]; 337 r.ptr[2] = l.array[2]; 338 r.ptr[3] = l.array[3]; 339 return r; 340 } 341 else 342 { 343 return _mm_round_ps!2(a); 344 } 345 } 346 unittest 347 { 348 __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f); 349 __m128 C = _mm_ceil_ps(A); 350 float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f]; 351 assert(C.array == correct); 352 } 353 354 /// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, 355 /// store the result as a double-precision floating-point element in the lower element of result, 356 /// and copy the upper element from `a` to the upper element of dst. 357 __m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted 358 { 359 static if (LDC_with_ARM64) 360 { 361 a[0] = vcvtps_s64_f64(b[0]); 362 return a; 363 } 364 else 365 { 366 return _mm_round_sd!2(a, b); 367 } 368 } 369 unittest 370 { 371 __m128d A = _mm_setr_pd(1.3, -2.12); 372 __m128d B = _mm_setr_pd(53.6, -3.7); 373 __m128d C = _mm_ceil_sd(A, B); 374 double[2] correct = [54.0, -2.12]; 375 assert(C.array == correct); 376 } 377 378 /// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value, 379 /// store the result as a single-precision floating-point element in the lower element of result, 380 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 381 __m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted 382 { 383 static if (LDC_with_ARM64) 384 { 385 a[0] = vcvtps_s32_f32(b[0]); 386 return a; 387 } 388 else 389 { 390 return _mm_round_ss!2(a, b); 391 } 392 } 393 unittest 394 { 395 __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f); 396 __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f); 397 __m128 C = _mm_ceil_ss(A, B); 398 float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f]; 399 assert(C.array == correct); 400 } 401 402 /// Compare packed 64-bit integers in `a` and `b` for equality. 
403 __m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted 404 { 405 // PERF DMD 406 static if (GDC_with_SSE41) 407 { 408 return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b); 409 } 410 else version(LDC) 411 { 412 // LDC x86: generates pcmpeqq since LDC 1.1 -O1 413 // arm64: generates cmeq since LDC 1.8 -O1 414 return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b); 415 } 416 else 417 { 418 // Clever pcmpeqd + pand use with LDC 1.24 -O2 419 long2 la = cast(long2)a; 420 long2 lb = cast(long2)b; 421 long2 res; 422 res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0; 423 res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0; 424 return cast(__m128i)res; 425 } 426 } 427 unittest 428 { 429 __m128i A = _mm_setr_epi64(-1, -2); 430 __m128i B = _mm_setr_epi64(-3, -2); 431 __m128i C = _mm_setr_epi64(-1, -4); 432 long2 AB = cast(long2) _mm_cmpeq_epi64(A, B); 433 long2 AC = cast(long2) _mm_cmpeq_epi64(A, C); 434 long[2] correct1 = [0, -1]; 435 long[2] correct2 = [-1, 0]; 436 assert(AB.array == correct1); 437 assert(AC.array == correct2); 438 } 439 440 441 /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers. 442 __m128i _mm_cvtepi16_epi32 (__m128i a) @trusted 443 { 444 // PERF DMD 445 static if (GDC_with_SSE41) 446 { 447 return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a); 448 } 449 else version(LDC) 450 { 451 // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64 452 enum ir = ` 453 %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3> 454 %r = sext <4 x i16> %v to <4 x i32> 455 ret <4 x i32> %r`; 456 return cast(__m128d) LDCInlineIR!(ir, int4, short8)(cast(short8)a); 457 } 458 else 459 { 460 short8 sa = cast(short8)a; 461 int4 r; 462 r.ptr[0] = sa.array[0]; 463 r.ptr[1] = sa.array[1]; 464 r.ptr[2] = sa.array[2]; 465 r.ptr[3] = sa.array[3]; 466 return r; 467 } 468 } 469 unittest 470 { 471 __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0); 472 int4 C = cast(int4) _mm_cvtepi16_epi32(A); 473 int[4] correct = [-1, 0, -32768, 32767]; 474 assert(C.array == correct); 475 } 476 477 /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers. 478 __m128i _mm_cvtepi16_epi64 (__m128i a) @trusted 479 { 480 // PERF DMD 481 static if (GDC_with_SSE41) 482 { 483 return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a); 484 } 485 else version(LDC) 486 { 487 // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64 488 enum ir = ` 489 %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1> 490 %r = sext <2 x i16> %v to <2 x i64> 491 ret <2 x i64> %r`; 492 return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a); 493 } 494 else 495 { 496 short8 sa = cast(short8)a; 497 long2 r; 498 r.ptr[0] = sa.array[0]; 499 r.ptr[1] = sa.array[1]; 500 return cast(__m128i)r; 501 } 502 } 503 unittest 504 { 505 __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0); 506 long2 C = cast(long2) _mm_cvtepi16_epi64(A); 507 long[2] correct = [-32768, 32767]; 508 assert(C.array == correct); 509 } 510 511 /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers. 
512 __m128i _mm_cvtepi32_epi64 (__m128i a) @trusted 513 { 514 // PERF DMD 515 static if (GDC_with_SSE41) 516 { 517 return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a); 518 } 519 else version(LDC) 520 { 521 // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64 522 enum ir = ` 523 %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1> 524 %r = sext <2 x i32> %v to <2 x i64> 525 ret <2 x i64> %r`; 526 return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a); 527 } 528 else 529 { 530 int4 sa = cast(int4)a; 531 long2 r; 532 r.ptr[0] = sa.array[0]; 533 r.ptr[1] = sa.array[1]; 534 return cast(__m128i)r; 535 } 536 } 537 unittest 538 { 539 __m128i A = _mm_setr_epi32(-4, 42, 0, 0); 540 long2 C = cast(long2) _mm_cvtepi32_epi64(A); 541 long[2] correct = [-4, 42]; 542 assert(C.array == correct); 543 } 544 545 546 /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers. 547 __m128i _mm_cvtepi8_epi16 (__m128i a) @trusted 548 { 549 // PERF DMD 550 static if (GDC_with_SSE41) 551 { 552 alias ubyte16 = __vector(ubyte[16]); 553 return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a); 554 } 555 else version(LDC) 556 { 557 // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0 558 // LDC ARM64: sshll generated since LDC 1.8.0 -O1 559 enum ir = ` 560 %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 561 %r = sext <8 x i8> %v to <8 x i16> 562 ret <8 x i16> %r`; 563 return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a); 564 } 565 else 566 { 567 byte16 sa = cast(byte16)a; 568 short8 r; 569 foreach(n; 0..8) 570 r.ptr[n] = sa.array[n]; 571 return cast(__m128i)r; 572 } 573 } 574 unittest 575 { 576 __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); 577 short8 C = cast(short8) _mm_cvtepi8_epi16(A); 578 short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8]; 579 assert(C.array == correct); 580 } 581 582 583 /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers. 584 __m128i _mm_cvtepi8_epi32 (__m128i a) @trusted 585 { 586 // PERF DMD 587 static if (GDC_with_SSE41) 588 { 589 alias ubyte16 = __vector(ubyte[16]); 590 return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a); 591 } 592 else static if (LDC_with_SSE41) 593 { 594 // LDC x86: Generates pmovsxbd since LDC 1.1 -O0 595 enum ir = ` 596 %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 597 %r = sext <4 x i8> %v to <4 x i32> 598 ret <4 x i32> %r`; 599 return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a); 600 } 601 else 602 { 603 // LDC ARM64: this gives the same codegen than a vmovl_s16/vmovl_s8 sequence would 604 byte16 sa = cast(byte16)a; 605 int4 r; 606 r.ptr[0] = sa.array[0]; 607 r.ptr[1] = sa.array[1]; 608 r.ptr[2] = sa.array[2]; 609 r.ptr[3] = sa.array[3]; 610 return cast(__m128i)r; 611 } 612 } 613 unittest 614 { 615 __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); 616 int4 C = cast(int4) _mm_cvtepi8_epi32(A); 617 int[4] correct = [127, -128, 1, -1]; 618 assert(C.array == correct); 619 } 620 621 622 /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers. 
623 __m128i _mm_cvtepi8_epi64 (__m128i a) @trusted 624 { 625 // PERF DMD 626 static if (GDC_with_SSE41) 627 { 628 alias ubyte16 = __vector(ubyte[16]); 629 return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a); 630 } 631 else version(LDC) 632 { 633 // LDC x86: Generates pmovsxbq since LDC 1.1 -O0, 634 // LDC arm64: it's ok since LDC 1.8 -O1 635 enum ir = ` 636 %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1> 637 %r = sext <2 x i8> %v to <2 x i64> 638 ret <2 x i64> %r`; 639 return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a); 640 } 641 else 642 { 643 byte16 sa = cast(byte16)a; 644 long2 r; 645 foreach(n; 0..2) 646 r.ptr[n] = sa.array[n]; 647 return cast(__m128i)r; 648 } 649 } 650 unittest 651 { 652 __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); 653 long2 C = cast(long2) _mm_cvtepi8_epi64(A); 654 long[2] correct = [127, -128]; 655 assert(C.array == correct); 656 } 657 658 659 /// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers. 660 __m128i _mm_cvtepu16_epi32 (__m128i a) @trusted 661 { 662 // PERF DMD 663 static if (GDC_with_SSE41) 664 { 665 return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a); 666 } 667 else 668 { 669 // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1 670 // arm64: ushll since LDC 1.12 -O1 671 short8 sa = cast(short8)a; 672 int4 r; 673 r.ptr[0] = cast(ushort)sa.array[0]; 674 r.ptr[1] = cast(ushort)sa.array[1]; 675 r.ptr[2] = cast(ushort)sa.array[2]; 676 r.ptr[3] = cast(ushort)sa.array[3]; 677 return cast(__m128i)r; 678 } 679 } 680 unittest 681 { 682 __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0); 683 int4 C = cast(int4) _mm_cvtepu16_epi32(A); 684 int[4] correct = [65535, 0, 32768, 32767]; 685 assert(C.array == correct); 686 } 687 688 689 /// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers. 690 __m128i _mm_cvtepu16_epi64 (__m128i a) @trusted 691 { 692 // PERF DMD 693 static if (GDC_with_SSE41) 694 { 695 return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a); 696 } 697 else static if (LDC_with_ARM64) 698 { 699 // LDC arm64: a bit shorter than below, in -O2 700 short8 sa = cast(short8)a; 701 long2 r; 702 for(int n = 0; n < 2; ++n) 703 r.ptr[n] = cast(ushort)sa.array[n]; 704 return cast(__m128i)r; 705 } 706 else 707 { 708 // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1 709 short8 sa = cast(short8)a; 710 long2 r; 711 r.ptr[0] = cast(ushort)sa.array[0]; 712 r.ptr[1] = cast(ushort)sa.array[1]; 713 return cast(__m128i)r; 714 } 715 } 716 unittest 717 { 718 __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0); 719 long2 C = cast(long2) _mm_cvtepu16_epi64(A); 720 long[2] correct = [65535, 0]; 721 assert(C.array == correct); 722 } 723 724 725 /// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers. 
726 __m128i _mm_cvtepu32_epi64 (__m128i a) @trusted 727 { 728 // PERF DMD 729 static if (GDC_with_SSE41) 730 { 731 return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(short8)a); 732 } 733 else 734 { 735 // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1 736 // arm64: generates ushll since LDC 1.12 -O1 737 int4 sa = cast(int4)a; 738 long2 r; 739 r.ptr[0] = cast(uint)sa.array[0]; 740 r.ptr[1] = cast(uint)sa.array[1]; 741 return cast(__m128i)r; 742 } 743 } 744 unittest 745 { 746 __m128i A = _mm_setr_epi32(-1, 42, 0, 0); 747 long2 C = cast(long2) _mm_cvtepu32_epi64(A); 748 long[2] correct = [4294967295, 42]; 749 assert(C.array == correct); 750 } 751 752 753 /// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers. 754 __m128i _mm_cvtepu8_epi16 (__m128i a) @trusted 755 { 756 // PERF DMD 757 static if (GDC_with_SSE41) 758 { 759 return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a); 760 } 761 else 762 { 763 // LDC x86: generates pmovzxbw since LDC 1.12 -O1 also good without SSE4.1 764 // arm64: ushll since LDC 1.12 -O1 765 // PERF: catastrophic with GDC without SSE4.1 766 byte16 sa = cast(byte16)a; 767 short8 r; 768 r.ptr[0] = cast(ubyte)sa.array[0]; 769 r.ptr[1] = cast(ubyte)sa.array[1]; 770 r.ptr[2] = cast(ubyte)sa.array[2]; 771 r.ptr[3] = cast(ubyte)sa.array[3]; 772 r.ptr[4] = cast(ubyte)sa.array[4]; 773 r.ptr[5] = cast(ubyte)sa.array[5]; 774 r.ptr[6] = cast(ubyte)sa.array[6]; 775 r.ptr[7] = cast(ubyte)sa.array[7]; 776 return cast(__m128i)r; 777 } 778 } 779 unittest 780 { 781 __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); 782 short8 C = cast(short8) _mm_cvtepu8_epi16(A); 783 short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248]; 784 assert(C.array == correct); 785 } 786 787 788 /// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers. 789 __m128i _mm_cvtepu8_epi32 (__m128i a) @trusted 790 { 791 // PERF DMD 792 static if (GDC_with_SSE41) 793 { 794 alias ubyte16 = __vector(ubyte[16]); 795 return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a); 796 } 797 else static if (LDC_with_ARM64) 798 { 799 // LDC arm64: a bit better than below in -O2 800 byte16 sa = cast(byte16)a; 801 int4 r; 802 for(int n = 0; n < 4; ++n) 803 r.ptr[n] = cast(ubyte)sa.array[n]; 804 return cast(__m128i)r; 805 } 806 else 807 { 808 // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1 809 // PERF: catastrophic with GDC without SSE4.1 810 byte16 sa = cast(byte16)a; 811 int4 r; 812 r.ptr[0] = cast(ubyte)sa.array[0]; 813 r.ptr[1] = cast(ubyte)sa.array[1]; 814 r.ptr[2] = cast(ubyte)sa.array[2]; 815 r.ptr[3] = cast(ubyte)sa.array[3]; 816 return cast(__m128i)r; 817 } 818 } 819 unittest 820 { 821 __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); 822 int4 C = cast(int4) _mm_cvtepu8_epi32(A); 823 int[4] correct = [127, 128, 1, 255]; 824 assert(C.array == correct); 825 } 826 827 /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers. 
828 __m128i _mm_cvtepu8_epi64 (__m128i a) @trusted 829 { 830 // PERF DMD 831 static if (GDC_with_SSE41) 832 { 833 alias ubyte16 = __vector(ubyte[16]); 834 return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a); 835 } 836 else static if (LDC_with_ARM64) 837 { 838 // LDC arm64: this optimizes better than the loop below 839 byte16 sa = cast(byte16)a; 840 long2 r; 841 for (int n = 0; n < 2; ++n) 842 r.ptr[n] = cast(ubyte)sa.array[n]; 843 return cast(__m128i)r; 844 } 845 else 846 { 847 // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1 848 byte16 sa = cast(byte16)a; 849 long2 r; 850 r.ptr[0] = cast(ubyte)sa.array[0]; 851 r.ptr[1] = cast(ubyte)sa.array[1]; 852 return cast(__m128i)r; 853 } 854 } 855 unittest 856 { 857 __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); 858 long2 C = cast(long2) _mm_cvtepu8_epi64(A); 859 long[2] correct = [127, 254]; 860 assert(C.array == correct); 861 } 862 863 /// Conditionally multiply the packed double-precision (64-bit) floating-point elements 864 /// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, and conditionally 865 /// store the sum in dst using the low 4 bits of `imm8`. 866 __m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted 867 { 868 // PERF DMD 869 static if (GDC_with_SSE41) 870 { 871 return __builtin_ia32_dppd(a, b, imm8 & 0x33); 872 } 873 else static if (LDC_with_SSE41) 874 { 875 return __builtin_ia32_dppd(a, b, imm8 & 0x33); 876 } 877 else 878 { 879 __m128d zero = _mm_setzero_pd(); 880 __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b); 881 double sum = temp.array[0] + temp.array[1]; 882 return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum)); 883 } 884 } 885 unittest 886 { 887 __m128d A = _mm_setr_pd(1.0, 2.0); 888 __m128d B = _mm_setr_pd(4.0, 8.0); 889 double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B); 890 double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B); 891 double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B); 892 double[2] correct1 = [ 4.0, 4.0]; 893 double[2] correct2 = [16.0, 0.0]; 894 double[2] correct3 = [ 0.0, 20.0]; 895 assert(R1.array == correct1); 896 assert(R2.array == correct2); 897 assert(R3.array == correct3); 898 } 899 900 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements 901 /// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, 902 /// and conditionally store the sum in result using the low 4 bits of `imm8`. 
903 __m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted 904 { 905 // PERF DMD 906 static if (GDC_with_SSE41) 907 { 908 return __builtin_ia32_dpps(a, b, cast(ubyte)imm8); 909 } 910 else static if (LDC_with_SSE41) 911 { 912 return __builtin_ia32_dpps(a, b, cast(byte)imm8); 913 } 914 else 915 { 916 __m128 zero = _mm_setzero_ps(); 917 __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b); 918 float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3]; 919 return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum)); 920 } 921 } 922 unittest 923 { 924 __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f); 925 __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f); 926 float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B); 927 float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B); 928 float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B); 929 float[4] correct1 = [67.0f, 67.0f, 67.0f, 67.0f]; 930 float[4] correct2 = [23.0f, 0.0f, 23.0f, 0.0f]; 931 float[4] correct3 = [0.0f, 29.0f, 0.0f, 29.0f]; 932 assert(R1.array == correct1); 933 assert(R2.array == correct2); 934 assert(R3.array == correct3); 935 } 936 937 938 /// Extract a 32-bit integer from `a`, selected with `imm8`. 939 int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted 940 { 941 return (cast(int4)a).array[imm8 & 3]; 942 } 943 unittest 944 { 945 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 946 assert(_mm_extract_epi32(A, 0) == 1); 947 assert(_mm_extract_epi32(A, 1 + 8) == 2); 948 assert(_mm_extract_epi32(A, 3 + 4) == 4); 949 } 950 951 /// Extract a 64-bit integer from `a`, selected with `imm8`. 952 long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted 953 { 954 long2 la = cast(long2)a; 955 return la.array[imm8 & 1]; 956 } 957 unittest 958 { 959 __m128i A = _mm_setr_epi64(45, -67); 960 assert(_mm_extract_epi64(A, 0) == 45); 961 assert(_mm_extract_epi64(A, 1) == -67); 962 assert(_mm_extract_epi64(A, 2) == 45); 963 } 964 965 /// Extract an 8-bit integer from `a`, selected with `imm8`. 966 /// Warning: the returned value is zero-extended to 32-bits. 967 int _mm_extract_epi8 (__m128i a, const int imm8) @trusted 968 { 969 byte16 ba = cast(byte16)a; 970 return cast(ubyte) ba.array[imm8 & 15]; 971 } 972 unittest 973 { 974 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15); 975 assert(_mm_extract_epi8(A, 7) == 7); 976 assert(_mm_extract_epi8(A, 13) == 255); 977 assert(_mm_extract_epi8(A, 7 + 16) == 7); 978 } 979 980 /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`. 981 /// Note: returns a 32-bit $(I integer). 982 int _mm_extract_ps (__m128 a, const int imm8) @trusted 983 { 984 return (cast(int4)a).array[imm8 & 3]; 985 } 986 unittest 987 { 988 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f); 989 assert(_mm_extract_ps(A, 0) == 0x3f800000); 990 assert(_mm_extract_ps(A, 1 + 8) == 0x40000000); 991 assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000); 992 } 993 994 995 996 /// Round the packed double-precision (64-bit) floating-point elements in `a` down to an 997 /// integer value, and store the results as packed double-precision floating-point elements. 
998 __m128d _mm_floor_pd (__m128d a) @trusted 999 { 1000 static if (LDC_with_ARM64) 1001 { 1002 // LDC arm64 acceptable since 1.8 -O2 1003 long2 l = vcvtmq_s64_f64(a); 1004 double2 r; 1005 r.ptr[0] = l.array[0]; 1006 r.ptr[1] = l.array[1]; 1007 return r; 1008 } 1009 else 1010 { 1011 return _mm_round_pd!1(a); 1012 } 1013 } 1014 unittest 1015 { 1016 __m128d A = _mm_setr_pd(1.3f, -2.12f); 1017 __m128d B = _mm_setr_pd(53.6f, -2.7f); 1018 A = _mm_floor_pd(A); 1019 B = _mm_floor_pd(B); 1020 double[2] correctA = [1.0, -3.0]; 1021 double[2] correctB = [53.0, -3.0]; 1022 assert(A.array == correctA); 1023 assert(B.array == correctB); 1024 } 1025 1026 /// Round the packed single-precision (32-bit) floating-point elements in `a` down to an 1027 /// integer value, and store the results as packed single-precision floating-point elements. 1028 __m128 _mm_floor_ps (__m128 a) @trusted 1029 { 1030 static if (LDC_with_ARM64) 1031 { 1032 // LDC arm64 acceptable since 1.8 -O1 1033 int4 l = vcvtmq_s32_f32(a); 1034 float4 r; 1035 r.ptr[0] = l.array[0]; 1036 r.ptr[1] = l.array[1]; 1037 r.ptr[2] = l.array[2]; 1038 r.ptr[3] = l.array[3]; 1039 return r; 1040 } 1041 else 1042 { 1043 return _mm_round_ps!1(a); 1044 } 1045 } 1046 unittest 1047 { 1048 __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f); 1049 __m128 C = _mm_floor_ps(A); 1050 float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f]; 1051 assert(C.array == correct); 1052 } 1053 1054 /// Round the lower double-precision (64-bit) floating-point element in `b` down to an 1055 /// integer value, store the result as a double-precision floating-point element in the 1056 /// lower element, and copy the upper element from `a` to the upper element. 1057 __m128d _mm_floor_sd (__m128d a, __m128d b) @trusted 1058 { 1059 static if (LDC_with_ARM64) 1060 { 1061 a[0] = vcvtms_s64_f64(b[0]); 1062 return a; 1063 } 1064 else 1065 { 1066 return _mm_round_sd!1(a, b); 1067 } 1068 } 1069 unittest 1070 { 1071 __m128d A = _mm_setr_pd(1.3, -2.12); 1072 __m128d B = _mm_setr_pd(-53.1, -3.7); 1073 __m128d C = _mm_floor_sd(A, B); 1074 double[2] correct = [-54.0, -2.12]; 1075 assert(C.array == correct); 1076 } 1077 1078 /// Round the lower single-precision (32-bit) floating-point element in `b` down to an 1079 /// integer value, store the result as a single-precision floating-point element in the 1080 /// lower element, and copy the upper 3 packed elements from `a` to the upper elements. 1081 __m128 _mm_floor_ss (__m128 a, __m128 b) @trusted 1082 { 1083 static if (LDC_with_ARM64) 1084 { 1085 a[0] = vcvtms_s32_f32(b[0]); 1086 return a; 1087 } 1088 else 1089 { 1090 return _mm_round_ss!1(a, b); 1091 } 1092 } 1093 unittest 1094 { 1095 __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f); 1096 __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f); 1097 __m128 C = _mm_floor_ss(A, B); 1098 float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f]; 1099 assert(C.array == correct); 1100 } 1101 1102 /// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`. 
1103 __m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted 1104 { 1105 // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1 1106 // LDC x86: psinrd since LDC 1.1 -O2 with -mattr=+sse4.1 1107 // LDC arm64: ins.s since LDC 1.8 -O2 1108 int4 ia = cast(int4)a; 1109 ia.ptr[imm8 & 3] = i; 1110 return cast(__m128i)ia; 1111 } 1112 unittest 1113 { 1114 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 1115 int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4); 1116 int[4] result = [1, 2, 5, 4]; 1117 assert(C.array == result); 1118 } 1119 1120 /// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`. 1121 __m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted 1122 { 1123 // GDC: nothing special to do, psinrq generated with -O1 -msse4.1 1124 // LDC x86: always do something sensible. 1125 long2 la = cast(long2)a; 1126 la.ptr[imm8 & 1] = i; 1127 return cast(__m128i)la; 1128 } 1129 unittest 1130 { 1131 __m128i A = _mm_setr_epi64(1, 2); 1132 long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2); 1133 long[2] result = [1, 5]; 1134 assert(C.array == result); 1135 } 1136 1137 /// Insert the 8-bit integer `i` into `a` at the location specified by `imm8[2:0]`. 1138 /// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8. 1139 __m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted 1140 { 1141 // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1 1142 // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory. 1143 byte16 ba = cast(byte16)a; 1144 ba.ptr[imm8 & 15] = cast(byte)i; 1145 return cast(__m128i)ba; 1146 } 1147 unittest 1148 { 1149 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 1150 byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16); 1151 byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 1152 assert(C.array == result); 1153 } 1154 1155 1156 /// Warning: of course it does something totally different from `_mm_insert_epi32`! 1157 /// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b` 1158 /// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]` 1159 /// (elements are zeroed out when the corresponding bit is set). 1160 __m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted 1161 { 1162 // PERF DMD 1163 static if (GDC_with_SSE41) 1164 { 1165 return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8); 1166 } 1167 else static if (LDC_with_SSE41) 1168 { 1169 return __builtin_ia32_insertps128(a, b, cast(byte)imm8); 1170 } 1171 else 1172 { 1173 float4 tmp2 = a; 1174 float tmp1 = b.array[(imm8 >> 6) & 3]; 1175 tmp2.ptr[(imm8 >> 4) & 3] = tmp1; 1176 return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps()); 1177 } 1178 } 1179 unittest 1180 { 1181 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1182 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1183 __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B); 1184 float[4] correct = [1.0f, 2.0f, 0.0f, 7.0f]; 1185 assert(C.array == correct); 1186 } 1187 1188 1189 /// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values. 
1190 __m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted 1191 { 1192 static if (GDC_with_SSE41) 1193 { 1194 return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b); 1195 } 1196 else version(LDC) 1197 { 1198 // x86: pmaxsd since LDC 1.1 -O1 1199 // ARM: smax.4s since LDC 1.8 -01 1200 int4 sa = cast(int4)a; 1201 int4 sb = cast(int4)b; 1202 int4 greater = greaterMask!int4(sa, sb); 1203 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 1204 } 1205 else 1206 { 1207 __m128i higher = _mm_cmpgt_epi32(a, b); 1208 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1209 __m128i mask = _mm_and_si128(aTob, higher); 1210 return _mm_xor_si128(b, mask); 1211 } 1212 } 1213 unittest 1214 { 1215 int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7), 1216 _mm_setr_epi32( -4,-8, 9, -8)); 1217 int[4] correct = [0x7fffffff, 1, 9, 7]; 1218 assert(R.array == correct); 1219 } 1220 1221 /// Compare packed signed 8-bit integers in `a` and `b`, 1222 /// and return packed maximum values. 1223 __m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted 1224 { 1225 // PERF DMD 1226 static if (GDC_with_SSE41) 1227 { 1228 return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b); 1229 } 1230 else version(LDC) 1231 { 1232 // x86: pmaxsb since LDC 1.1 -O1 1233 // ARM64: smax.16b since LDC 1.8.0 -O1 1234 byte16 sa = cast(byte16)a; 1235 byte16 sb = cast(byte16)b; 1236 byte16 greater = cast(byte16) greaterMask!byte16(sa, sb); 1237 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 1238 } 1239 else 1240 { 1241 __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else 1242 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1243 __m128i mask = _mm_and_si128(aTob, lower); 1244 return _mm_xor_si128(b, mask); 1245 } 1246 } 1247 unittest 1248 { 1249 __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); 1250 __m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 1251 byte16 R = cast(byte16) _mm_max_epi8(A, B); 1252 byte[16] correct = [127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0]; 1253 assert(R.array == correct); 1254 } 1255 1256 /// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values. 1257 __m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted 1258 { 1259 // PERF DMD 1260 static if (GDC_with_SSE41) 1261 { 1262 return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b); 1263 } 1264 else version(LDC) 1265 { 1266 // x86: pmaxuw since LDC 1.1 -O1 1267 // ARM64: umax.8h since LDC 1.8.0 -O1 1268 // PERF: without sse4.1, LLVM 12 produces a very interesting 1269 // psubusw xmm0, xmm1 1270 // paddw xmm0, xmm1 1271 // sequence that maybe should go in other min/max intrinsics? 1272 ushort8 sa = cast(ushort8)a; 1273 ushort8 sb = cast(ushort8)b; 1274 ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb); 1275 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 1276 } 1277 else 1278 { 1279 b = _mm_subs_epu16(b, a); 1280 b = _mm_add_epi16(b, a); 1281 return b; 1282 } 1283 } 1284 unittest 1285 { 1286 short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57), 1287 _mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0)); 1288 short[8] correct = [ -4, -8, -4, -7, 9,-32768, 0, 57]; 1289 assert(R.array == correct); 1290 } 1291 1292 /// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values. 
1293 __m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted 1294 { 1295 // PERF DMD 1296 static if (GDC_with_SSE41) 1297 { 1298 return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b); 1299 } 1300 else version(LDC) 1301 { 1302 // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1 1303 // ARM64: umax.4s since LDC 1.8.0 -O1 1304 uint4 sa = cast(uint4)a; 1305 uint4 sb = cast(uint4)b; 1306 uint4 greater = cast(uint4) greaterMask!uint4(sa, sb); 1307 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 1308 } 1309 else 1310 { 1311 __m128i valueShift = _mm_set1_epi32(-0x80000000); 1312 __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift)); 1313 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1314 __m128i mask = _mm_and_si128(aTob, higher); 1315 return _mm_xor_si128(b, mask); 1316 } 1317 } 1318 unittest 1319 { 1320 int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7), 1321 _mm_setr_epi32( -4,-8, 9, -8)); 1322 int[4] correct = [ -4,-8, 9, -7]; 1323 assert(R.array == correct); 1324 } 1325 1326 /// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values. 1327 __m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted 1328 { 1329 // PERF DMD 1330 static if (GDC_with_SSE41) 1331 { 1332 return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b); 1333 } 1334 else version(LDC) 1335 { 1336 // x86: pminsd since LDC 1.1 -O1, also good without sse4.1 1337 // ARM: smin.4s since LDC 1.8 -01 1338 int4 sa = cast(int4)a; 1339 int4 sb = cast(int4)b; 1340 int4 greater = greaterMask!int4(sa, sb); 1341 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 1342 } 1343 else 1344 { 1345 __m128i higher = _mm_cmplt_epi32(a, b); 1346 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1347 __m128i mask = _mm_and_si128(aTob, higher); 1348 return _mm_xor_si128(b, mask); 1349 } 1350 } 1351 unittest 1352 { 1353 int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7), 1354 _mm_setr_epi32( -4, -8, 9, -8)); 1355 int[4] correct = [ -4, -8, -4, -8]; 1356 assert(R.array == correct); 1357 } 1358 1359 /// Compare packed signed 8-bit integers in `a` and `b`, 1360 /// and return packed minimum values. 1361 __m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted 1362 { 1363 // PERF DMD 1364 static if (GDC_with_SSE41) 1365 { 1366 return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b); 1367 } 1368 else version(LDC) 1369 { 1370 // x86: pminsb since LDC 1.1 -O1 1371 // ARM64: smin.16b since LDC 1.8.0 -O1 1372 byte16 sa = cast(byte16)a; 1373 byte16 sb = cast(byte16)b; 1374 byte16 greater = cast(byte16) greaterMask!byte16(sa, sb); 1375 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 1376 } 1377 else 1378 { 1379 __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else 1380 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1381 __m128i mask = _mm_and_si128(aTob, lower); 1382 return _mm_xor_si128(b, mask); 1383 } 1384 } 1385 unittest 1386 { 1387 __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); 1388 __m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 1389 byte16 R = cast(byte16) _mm_min_epi8(A, B); 1390 byte[16] correct = [ 4, -8, -4, -8, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 1391 assert(R.array == correct); 1392 } 1393 1394 /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst. 
1395 __m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted 1396 { 1397 // PERF DMD 1398 static if (GDC_with_SSE41) 1399 { 1400 return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b); 1401 } 1402 else version(LDC) 1403 { 1404 // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1 1405 // ARM64: umin.8h since LDC 1.8.0 -O1 1406 ushort8 sa = cast(ushort8)a; 1407 ushort8 sb = cast(ushort8)b; 1408 ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa); 1409 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 1410 } 1411 else 1412 { 1413 __m128i c = _mm_subs_epu16(b, a); 1414 b = _mm_sub_epi16(b, c); 1415 return b; 1416 } 1417 } 1418 unittest 1419 { 1420 short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57), 1421 _mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0)); 1422 short[8] correct = [32767, 1, 9, -8, 0, 7, 0, 0]; 1423 assert(R.array == correct); 1424 } 1425 1426 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst. 1427 __m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted 1428 { 1429 // PERF DMD 1430 static if (GDC_with_SSE41) 1431 { 1432 return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b); 1433 } 1434 else version(LDC) 1435 { 1436 // x86: pminud since LDC 1.1 -O1, also good without sse4.1 1437 // ARM64: umin.4s since LDC 1.8.0 -O1 1438 uint4 sa = cast(uint4)a; 1439 uint4 sb = cast(uint4)b; 1440 uint4 greater = cast(uint4) greaterMask!uint4(sa, sb); 1441 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 1442 } 1443 else 1444 { 1445 __m128i valueShift = _mm_set1_epi32(-0x80000000); 1446 __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift)); 1447 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1448 __m128i mask = _mm_and_si128(aTob, higher); 1449 return _mm_xor_si128(b, mask); 1450 } 1451 } 1452 unittest 1453 { 1454 int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7), 1455 _mm_setr_epi32( -4,-8, 9, -8)); 1456 int[4] correct = [0x7fffffff, 1, 4, -8]; 1457 assert(R.array == correct); 1458 } 1459 1460 /// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`, 1461 /// store the minimum and index in return value, and zero the remaining bits. 
1462 __m128i _mm_minpos_epu16 (__m128i a) @trusted 1463 { 1464 // PERF DMD 1465 static if (GDC_with_SSE41) 1466 { 1467 return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a); 1468 } 1469 else static if (LDC_with_SSE41) 1470 { 1471 return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a); 1472 } 1473 else static if (LDC_with_ARM64) 1474 { 1475 __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 1476 __m128i combinedLo = _mm_unpacklo_epi16(indices, a); 1477 __m128i combinedHi = _mm_unpackhi_epi16(indices, a); 1478 __m128i best = _mm_min_epu32(combinedLo, combinedHi); 1479 best = _mm_min_epu32(best, _mm_srli_si128!8(best)); 1480 best = _mm_min_epu32(best, _mm_srli_si128!4(best)); 1481 short8 sbest = cast(short8)best; 1482 short8 r; 1483 r[0] = sbest[1]; 1484 r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie 1485 r[2] = 0; 1486 r[3] = 0; 1487 r[4] = 0; 1488 r[5] = 0; 1489 r[6] = 0; 1490 r[7] = 0; 1491 return cast(__m128i)r; 1492 } 1493 else 1494 { 1495 short8 sa = cast(short8)a; 1496 ushort min = 0xffff; 1497 int index = 0; 1498 for(int n = 0; n < 8; ++n) 1499 { 1500 ushort c = sa.array[n]; 1501 if (c < min) 1502 { 1503 min = c; 1504 index = n; 1505 } 1506 } 1507 short8 r; 1508 r.ptr[0] = min; 1509 r.ptr[1] = cast(short)index; 1510 return cast(__m128i)r; 1511 } 1512 } 1513 unittest 1514 { 1515 __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6); 1516 __m128i B = _mm_setr_epi16(14, 4, 4, 2, -3, 2, 5, 6); 1517 short8 R1 = cast(short8) _mm_minpos_epu16(A); 1518 short8 R2 = cast(short8) _mm_minpos_epu16(B); 1519 short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0]; 1520 short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0]; 1521 assert(R1.array == correct1); 1522 assert(R2.array == correct2); 1523 } 1524 1525 /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers 1526 /// in `a` compared to those in `b`, and store the 16-bit results in dst. 1527 /// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`. 1528 /// One quadruplet is selected from `b` starting at on the offset specified in `imm8[1:0]`. 1529 /// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting 1530 /// at the offset specified in `imm8[2]`. 1531 __m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted 1532 { 1533 // PERF DMD 1534 static if (GDC_with_SSE41) 1535 { 1536 return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8); 1537 } 1538 else static if (LDC_with_SSE41) 1539 { 1540 return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8); 1541 } 1542 else 1543 { 1544 int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high order quadruplet are unaddressable... 
1545 int b_offset = (imm8 & 3) * 4; 1546 1547 byte16 ba = cast(byte16)a; 1548 byte16 bb = cast(byte16)b; 1549 short8 r; 1550 1551 __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0); 1552 1553 for (int j = 0; j < 8; j += 2) 1554 { 1555 int k = a_offset + j; 1556 __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3], 1557 0, 0, 0, 0, 1558 ba[k+1], ba[k+2], ba[k+3], ba[k+4], 1559 0, 0, 0, 0); 1560 short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64 1561 r.ptr[j] = diffs.array[0]; 1562 r.ptr[j+1] = diffs.array[4]; 1563 } 1564 return cast(__m128i)r; 1565 } 1566 } 1567 unittest 1568 { 1569 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 1570 __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5, 5, 5, 12, 13, 14, 15); 1571 short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23]; 1572 short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749]; 1573 short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35]; 1574 short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741]; 1575 short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4]; 1576 short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B); 1577 short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B); 1578 short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B); 1579 short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B); 1580 short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B); 1581 assert(r1.array == correct1); 1582 assert(r4.array == correct4); 1583 assert(r5.array == correct5); 1584 assert(r7.array == correct7); 1585 assert(r8.array == correct0); 1586 } 1587 1588 /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst. 1589 __m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted 1590 { 1591 // PERF DMD 1592 static if (GDC_with_SSE41) 1593 { 1594 return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b); 1595 } 1596 else static if (LDC_with_SSE41) 1597 { 1598 // For some reason, clang has the builtin but it's not in IntrinsicsX86.td 1599 // Use IR instead. 
1600 // This generates pmuldq with since LDC 1.2.0 -O0 1601 enum ir = ` 1602 %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2> 1603 %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2> 1604 %la = sext <2 x i32> %ia to <2 x i64> 1605 %lb = sext <2 x i32> %ib to <2 x i64> 1606 %r = mul <2 x i64> %la, %lb 1607 ret <2 x i64> %r`; 1608 return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b); 1609 } 1610 else static if (LDC_with_ARM64) 1611 { 1612 // 3 instructions since LDC 1.8 -O2 1613 // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull 1614 int2 a_lo = vmovn_s64(cast(long2)a); 1615 int2 b_lo = vmovn_s64(cast(long2)b); 1616 return cast(__m128i) vmull_s32(a_lo, b_lo); 1617 } 1618 else 1619 { 1620 int4 ia = cast(int4)a; 1621 int4 ib = cast(int4)b; 1622 long2 r; 1623 r.ptr[0] = cast(long)ia.array[0] * ib.array[0]; 1624 r.ptr[1] = cast(long)ia.array[2] * ib.array[2]; 1625 return cast(__m128i)r; 1626 } 1627 } 1628 unittest 1629 { 1630 __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3); 1631 __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0); 1632 long2 R = cast(long2) _mm_mul_epi32(A, B); 1633 long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144]; 1634 assert(R.array == correct); 1635 } 1636 1637 /// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, 1638 /// return the low 32 bits of the intermediate integers. 1639 __m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted 1640 { 1641 // PERF DMD 1642 // PERF GDC without SSE4.1 could be better 1643 static if (GDC_with_SSE41) 1644 { 1645 int4 ia = cast(int4)a; 1646 int4 ib = cast(int4)b; 1647 // Note: older GDC doesn't have that op, but older GDC 1648 // also has no support for -msse4.1 detection 1649 return cast(__m128i)(a * b); 1650 } 1651 else version(LDC) 1652 { 1653 int4 ia = cast(int4)a; 1654 int4 ib = cast(int4)b; 1655 return cast(__m128i)(a * b); 1656 } 1657 else 1658 { 1659 // DMD doesn't take the above 1660 int4 ia = cast(int4)a; 1661 int4 ib = cast(int4)b; 1662 int4 r; 1663 r.ptr[0] = ia.array[0] * ib.array[0]; 1664 r.ptr[1] = ia.array[1] * ib.array[1]; 1665 r.ptr[2] = ia.array[2] * ib.array[2]; 1666 r.ptr[3] = ia.array[3] * ib.array[3]; 1667 return r; 1668 } 1669 } 1670 unittest 1671 { 1672 __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3); 1673 __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0); 1674 int4 R = cast(int4) _mm_mullo_epi32(A, B); 1675 int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0]; 1676 assert(R.array == correct); 1677 } 1678 1679 1680 /// Convert packed signed 32-bit integers from `a` and `b` 1681 /// to packed 16-bit integers using unsigned saturation. 1682 __m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted 1683 { 1684 static if (GDC_with_SSE41) 1685 { 1686 // PERF For some reason doesn't generates the builtin??? 
1687 return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b); 1688 } 1689 else static if (LDC_with_SSE41) 1690 { 1691 return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b); 1692 } 1693 else static if (LDC_with_ARM64) 1694 { 1695 int4 z; 1696 z = 0; 1697 return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)), 1698 vqmovn_u32(vmaxq_s32(z, cast(int4)b))); 1699 } 1700 else 1701 { 1702 // PERF: not great without SSE4.1 1703 int4 sa = cast(int4)a; 1704 int4 sb = cast(int4)b; 1705 align(16) ushort[8] result; 1706 for (int i = 0; i < 4; ++i) 1707 { 1708 int s = sa.array[i]; 1709 if (s < 0) s = 0; 1710 if (s > 65535) s = 65535; 1711 result.ptr[i] = cast(ushort)s; 1712 1713 s = sb.array[i]; 1714 if (s < 0) s = 0; 1715 if (s > 65535) s = 65535; 1716 result.ptr[i+4] = cast(ushort)s; 1717 } 1718 return *cast(__m128i*)(result.ptr); 1719 } 1720 } 1721 unittest 1722 { 1723 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 1724 short8 R = cast(short8) _mm_packus_epi32(A, A); 1725 short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0]; 1726 assert(R.array == correct); 1727 } 1728 1729 1730 /// Round the packed double-precision (64-bit) floating-point elements in `a` using the 1731 /// rounding parameter, and store the results as packed double-precision floating-point elements. 1732 /// Rounding is done according to the rounding[3:0] parameter, which can be one of: 1733 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions 1734 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions 1735 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions 1736 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions 1737 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE 1738 __m128d _mm_round_pd(int rounding)(__m128d a) @trusted 1739 { 1740 // PERF DMD 1741 static if (GDC_with_SSE41) 1742 { 1743 return __builtin_ia32_roundpd(a, rounding); 1744 } 1745 else static if (LDC_with_SSE41) 1746 { 1747 return __builtin_ia32_roundpd(a, rounding); 1748 } 1749 else 1750 { 1751 static if (rounding & _MM_FROUND_CUR_DIRECTION) 1752 { 1753 // Convert to 64-bit integers 1754 long lo = _mm_cvtsd_si64(a); 1755 a.ptr[0] = a.array[1]; 1756 long hi = _mm_cvtsd_si64(a); 1757 return _mm_setr_pd(lo, hi); 1758 } 1759 else 1760 { 1761 version(GNU) pragma(inline, false); // else fail unittest with optimizations 1762 1763 uint old = _MM_GET_ROUNDING_MODE(); 1764 _MM_SET_ROUNDING_MODE((rounding & 3) << 13); 1765 1766 // Convert to 64-bit integers 1767 long lo = _mm_cvtsd_si64(a); 1768 a.ptr[0] = a.array[1]; 1769 long hi = _mm_cvtsd_si64(a); 1770 1771 // Convert back to double to achieve the rounding 1772 // The problem is that a 64-bit double can't represent all the values 1773 // a 64-bit integer can (and vice-versa). So this function won't work for 1774 // large values. (TODO: what range exactly?) 1775 _MM_SET_ROUNDING_MODE(old); 1776 return _mm_setr_pd(lo, hi); 1777 } 1778 } 1779 } 1780 unittest 1781 { 1782 // tested in other intrinsics 1783 } 1784 1785 /// Round the packed single-precision (32-bit) floating-point elements in `a` using the 1786 /// rounding parameter, and store the results as packed single-precision floating-point elements. 


/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ps(int rounding)(__m128 a) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            __m128i integers = _mm_cvtps_epi32(a);
            return _mm_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get reordered
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m128i integers = _mm_cvtps_epi32(a);

            // Convert back to float to achieve the rounding.
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice versa), so this function won't work
            // for large values. (TODO: what range exactly?)
            __m128 result = _mm_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
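
// Illustrative example (a minimal sketch): _MM_FROUND_TO_ZERO truncates toward zero,
// which differs from rounding down for negative inputs.
unittest
{
    __m128 A = _mm_setr_ps(1.7f, -1.7f, 2.5f, -2.5f);
    float4 R = cast(float4) _mm_round_ps!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(A);
    float[4] correct = [1.0f, -1.0f, 2.0f, -2.0f];
    assert(R.array == correct);
}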


/// Round the lower double-precision (64-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a double-precision floating-point element
/// in the lower element of result, and copy the upper element from `a` to the upper element of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;
            return a;
        }
        else
        {
            version(GNU) pragma(inline, false); // else unittests fail with optimizations

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;

            // Convert back to double to achieve the rounding.
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice versa), so this function won't work
            // for large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}


/// Round the lower single-precision (32-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a single-precision floating-point element
/// in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;
            return a;
        }
        else version(GNU)
        {
            pragma(inline, false)
            __m128 GDCworkaround() nothrow @nogc @trusted
            {
                uint old = _MM_GET_ROUNDING_MODE();
                _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

                // Convert to 32-bit integer
                int b0 = _mm_cvtss_si32(b);
                a.ptr[0] = b0;

                // Convert back to float to achieve the rounding.
                // The problem is that a 32-bit float can't represent all the values
                // a 32-bit integer can (and vice versa), so this function won't work
                // for large values. (TODO: what range exactly?)
                _MM_SET_ROUNDING_MODE(old);
                return a;
            }
            return GDCworkaround();
        }
        else
        {
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 32-bit integer
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;

            // Convert back to float to achieve the rounding.
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice versa), so this function won't work
            // for large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}


/// Load 128-bits of integer data from memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
__m128i _mm_stream_load_si128 (__m128i * mem_addr) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr);
    }
    else version(LDC)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(mem_addr);
    }
    else
    {
        return *mem_addr; // regular move instead
    }
}
// TODO unittest
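
// Illustrative sketch while a dedicated unittest is still TODO: a non-temporal load
// must return exactly the 16 bytes it reads. A __m128i local is naturally 16-byte
// aligned, so its address is a valid argument.
unittest
{
    __m128i A = _mm_setr_epi32(7, -1, 1000000, 42);
    int4 R = cast(int4) _mm_stream_load_si128(&A);
    int[4] correct = [7, -1, 1000000, 42];
    assert(R.array == correct);
}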


/// Return 1 if all bits in `a` are 1, otherwise return 0.
int _mm_test_all_ones (__m128i a) @safe
{
    return _mm_testc_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(-1);
    __m128i B = _mm_set_epi32(-1, -2, -1, -1);
    assert(_mm_test_all_ones(A) == 1);
    assert(_mm_test_all_ones(B) == 0);
}

/// Return 1 if all bits in `a` are 0, otherwise return 0.
// This is a #BONUS since it was lacking in the Intel Intrinsics API.
int _mm_test_all_zeros (__m128i a) @safe
{
    return _mm_testz_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(0);
    __m128i B = _mm_set_epi32(0, 8, 0, 0);
    assert(_mm_test_all_zeros(A) == 1);
    assert(_mm_test_all_zeros(B) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
/// and return 1 if the result is zero, otherwise return 0.
int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
{
    return _mm_testz_si128(a, mask); // it's really the same, but with a good name
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `mask`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
{
    return _mm_testnzc_si128(a, mask);
}
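
// Illustrative check (a minimal sketch): the result is 1 only when `mask` selects
// at least one set bit and at least one cleared bit of `a`.
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M = _mm_setr_epi32(0x03, 0x00, 0x00, 0x00); // selects a 1-bit and a 0-bit of A
    __m128i Z = _mm_setzero_si128();
    assert(_mm_test_mix_ones_zeros(A, M) == 1);
    assert(_mm_test_mix_ones_zeros(A, A) == 0); // only set bits selected
    assert(_mm_test_mix_ones_zeros(A, Z) == 0); // nothing selected
}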

/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the
/// result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 1 in `a`.
int _mm_testc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testc_si128(A, A) == 1);
    assert(_mm_testc_si128(A, M1) == 0);
    assert(_mm_testc_si128(A, M2) == 1);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
        long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);

        return !(  !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
                 | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
    }
    else
    {
        __m128i c = a & b;
        __m128i d = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return !( (c.array == zero) || (d.array == zero) );
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
    __m128i Z = _mm_setzero_si128();
    assert(_mm_testnzc_si128(A, Z) == 0);
    assert(_mm_testnzc_si128(A, M) == 1);
    assert(_mm_testnzc_si128(A, A) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm_testz_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testz_si128(A, A) == 0);
    assert(_mm_testz_si128(A, M1) == 1);
    assert(_mm_testz_si128(A, M2) == 0);
}
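
// Illustrative sketch of how the three PTEST-style intrinsics relate: _mm_testz_si128
// reports ZF, _mm_testc_si128 reports CF, and _mm_testnzc_si128 is 1 exactly when
// both are 0.
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
    int zf = _mm_testz_si128(A, M); // ZF: is (A & M) all zero?
    int cf = _mm_testc_si128(A, M); // CF: is (~A & M) all zero?
    assert(zf == 0 && cf == 0);
    assert(_mm_testnzc_si128(A, M) == 1);
    assert(_mm_testnzc_si128(A, A) == ((_mm_testz_si128(A, A) == 0 && _mm_testc_si128(A, A) == 0) ? 1 : 0));
}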