/**
* SSE4.1 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.smmintrin;

// SSE4.1 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
// Note: this header will work whether you have SSE4.1 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
// generate SSE4.1 instructions.
// With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions.

public import inteli.types;
import inteli.internals;

// smmintrin pulls in all previous instruction set intrinsics.
public import inteli.tmmintrin;

nothrow @nogc:

// Individual flag values.
enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
enum int _MM_FROUND_TO_NEG_INF     = 0x01; /// ditto
enum int _MM_FROUND_TO_POS_INF     = 0x02; /// ditto
enum int _MM_FROUND_TO_ZERO        = 0x03; /// ditto
enum int _MM_FROUND_CUR_DIRECTION  = 0x04; /// ditto
enum int _MM_FROUND_RAISE_EXC      = 0x00; /// ditto
enum int _MM_FROUND_NO_EXC         = 0x08; /// ditto

// Ready-made combinations: one exception flag OR'ed with one rounding flag.
enum int _MM_FROUND_NINT      = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT;
enum int _MM_FROUND_FLOOR     = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
enum int _MM_FROUND_CEIL      = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
enum int _MM_FROUND_TRUNC     = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
enum int _MM_FROUND_RINT      = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
enum int _MM_FROUND_NEARBYINT = _MM_FROUND_NO_EXC    | _MM_FROUND_CUR_DIRECTION;

/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // GDC with SSE4.1: hand the compile-time mask straight to the builtin.
        short8 lanesA = cast(short8)a;
        short8 lanesB = cast(short8)b;
        return cast(__m128i) __builtin_ia32_pblendw128(lanesA, lanesB, imm8);
    }
    else
    {
        // LDC x86: this generates pblendw since LDC 1.1 and -O2
        short8 lanesA = cast(short8)a;
        short8 lanesB = cast(short8)b;
        short8 blended;
        foreach (lane; 0 .. 8)
        {
            // Bit `lane` of imm8 set: take the lane from `b`, otherwise from `a`.
            if (imm8 & (1 << lane))
                blended.ptr[lane] = lanesB.array[lane];
            else
                blended.ptr[lane] = lanesA.array[lane];
        }
        return cast(__m128i)blended;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
    short[8] expected = [8, 9, 2, 3, 12, 5, 6, 15];
    assert(C.array == expected);
}


/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
// Note: changed signature, GDC needs a compile-time value for `imm8`.
__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    // Only the two low mask bits are accepted.
    static assert(imm8 >= 0 && imm8 < 4);
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
    }
    else
    {
        // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
        double2 blended;
        foreach (lane; 0 .. 2)
        {
            // Bit `lane` of imm8 set: take the lane from `b`, otherwise from `a`.
            if (imm8 & (1 << lane))
                blended.ptr[lane] = b.array[lane];
            else
                blended.ptr[lane] = a.array[lane];
        }
        return cast(__m128d)blended;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0, 1);
    __m128d B = _mm_setr_pd(8, 9);
    double2 C = _mm_blend_pd!2(A, B);
    double[2] expected = [0, 9];
    assert(C.array == expected);
}


/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control mask `imm8`.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    // Only the four low mask bits are accepted.
    static assert(imm8 >= 0 && imm8 < 16);
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendps(a, b, imm8);
    }
    else version(LDC)
    {
        // LDC x86: generates blendps since LDC 1.1 -O2
        // arm64: pretty good, two instructions worst case
        // A set mask bit selects index 4+lane instead of index lane.
        enum int lane0 = (imm8 & 1) ? 4 : 0;
        enum int lane1 = (imm8 & 2) ? 5 : 1;
        enum int lane2 = (imm8 & 4) ? 6 : 2;
        enum int lane3 = (imm8 & 8) ? 7 : 3;
        return shufflevectorLDC!(float4, lane0, lane1, lane2, lane3)(a, b);
    }
    else
    {
        __m128 blended; // PERF =void;
        foreach (lane; 0 .. 4)
        {
            // Bit `lane` of imm8 set: take the lane from `b`, otherwise from `a`.
            if (imm8 & (1 << lane))
                blended.ptr[lane] = b.array[lane];
            else
                blended.ptr[lane] = a.array[lane];
        }
        return blended;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(0, 1, 2, 3);
    __m128 B = _mm_setr_ps(8, 9, 10, 11);
    float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101
    float[4] expected = [8, 1, 10, 11];
    assert(C.array == expected);
}

/// Blend packed 8-bit integers from `a` and `b` using `mask`.
__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
{
    // PERF DMD
    /*static if (GDC_with_SSE41)
    {
        // This intrinsic do nothing in GDC 12.
        // TODO report to GDC. No problem in GCC.
        return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask);
    }
    else*/
    static if (LDC_with_SSE41)
    {
        byte16 bytesA = cast(byte16)a;
        byte16 bytesB = cast(byte16)b;
        byte16 bytesM = cast(byte16)mask;
        return cast(__m128i) __builtin_ia32_pblendvb(bytesA, bytesB, bytesM);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: two instructions since LDC 1.12 -O2
        // Arithmetic shift by 7 spreads each byte's sign bit over the whole byte.
        byte16 signFill = vshrq_n_s8(cast(byte16)mask, 7);
        return cast(__m128i) vbslq_s8(signFill, cast(byte16)b, cast(byte16)a);
    }
    else
    {
        // 0 > mask is all-ones in every byte whose mask byte is negative.
        __m128i negative = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
        __m128i diff = _mm_xor_si128(a, b);
        // Saturating subtract clears (a ^ b) where selected, keeps it elsewhere,
        // so the final xor with b yields b where selected and a elsewhere.
        __m128i kept = _mm_subs_epu8(diff, negative);
        return _mm_xor_si128(kept, b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,
                               8,  9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23,
                              24, 25, 26, 27, 28, 29, 30, 31);
    __m128i M = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8, 127,
                               1,  1, -1, -1,  4,  1,  8, -128);
    byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M);
    byte[16] expected =      [ 0, 17,  2,  3, 20,  5, 22,  7,
                               8,  9, 26, 27, 12, 13, 14, 31 ];
    assert(R.array == expected);
}


/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE42)
    {
        // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction
        // with -msse4.2 but not -msse4.1.
        // Not sure what is the reason, and there is a replacement sequence.
        // Sounds like a bug.
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        // Arithmetic shift by 63 spreads each lane's sign bit over the whole lane.
        long2 shiftBy;
        shiftBy = 63;
        long2 signFill = cast(long2)mask >> shiftBy;
        return cast(__m128d) vbslq_s64(signFill, cast(long2)b, cast(long2)a);
    }
    else
    {
        __m128d blended; // PERF =void;
        long2 maskBits = cast(long2)mask;
        foreach (lane; 0 .. 2)
        {
            // A negative integer view means the lane's sign bit is set.
            if (maskBits.array[lane] < 0)
                blended.ptr[lane] = b.array[lane];
            else
                blended.ptr[lane] = a.array[lane];
        }
        return blended;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    __m128d M1 = _mm_setr_pd(-3.0, 2.0);
    __m128d R1 = _mm_blendv_pd(A, B, M1);
    double[2] expected1 = [3.0, 2.0];
    assert(R1.array == expected1);

    // Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost
    // See Issue #78
    __m128d M2 = _mm_setr_pd(double.nan, double.infinity);
    __m128d R2 = _mm_blendv_pd(A, B, M2);
    double[2] expected2 = [1.0, 2.0];
    assert(R2.array == expected2);
}


/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        // Arithmetic shift by 31 spreads each lane's sign bit over the whole lane.
        int4 shiftBy;
        shiftBy = 31;
        int4 signFill = cast(int4)mask >> shiftBy;
        return cast(__m128) vbslq_s32(signFill, cast(int4)b, cast(int4)a);
    }
    else
    {
        __m128 blended; // PERF =void;
        int4 maskBits = cast(int4)mask;
        foreach (lane; 0 .. 4)
        {
            // A negative integer view means the lane's sign bit is set.
            if (maskBits.array[lane] < 0)
                blended.ptr[lane] = b.array[lane];
            else
                blended.ptr[lane] = a.array[lane];
        }
        return blended;
    }
}
unittest
{
    __m128 A  = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B  = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
    __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
    __m128 M2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f);
    __m128 R1 = _mm_blendv_ps(A, B, M1);
    __m128 R2 = _mm_blendv_ps(A, B, M2);
    float[4] expected1 =    [ 4.0f, 1.0f, 2.0f, 7.0f];
    float[4] expected2 =    [ 0.0f, 1.0f, 6.0f, 3.0f];
    assert(R1.array == expected1);

    // Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost
    // See Issue #78
    assert(R2.array == expected2);
}

/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value,
/// and store the results as packed double-precision floating-point elements.
__m128d _mm_ceil_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        // Unfortunately x86 intrinsics force a round-trip back to double2
        // ARM neon semantics wouldn't have that
        long2 asInteger = vcvtpq_s64_f64(a);
        double2 result;
        result.ptr[0] = asInteger.array[0];
        result.ptr[1] = asInteger.array[1];
        return result;
    }
    else
    {
        // Rounding mode 2 is _MM_FROUND_TO_POS_INF.
        return _mm_round_pd!2(a);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3f, -2.12f);
    __m128d B = _mm_setr_pd(53.6f, -2.7f);
    A = _mm_ceil_pd(A);
    B = _mm_ceil_pd(B);
    double[2] expectedA = [2.0, -2.0];
    double[2] expectedB = [54.0, -2.0];
    assert(A.array == expectedA);
    assert(B.array == expectedB);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value,
/// and store the results as packed single-precision floating-point elements.
__m128 _mm_ceil_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        // Convert to integers, then widen each lane back to float.
        int4 asInteger = vcvtpq_s32_f32(a);
        float4 result;
        result.ptr[0] = asInteger.array[0];
        result.ptr[1] = asInteger.array[1];
        result.ptr[2] = asInteger.array[2];
        result.ptr[3] = asInteger.array[3];
        return result;
    }
    else
    {
        // Rounding mode 2 is _MM_FROUND_TO_POS_INF.
        return _mm_round_ps!2(a);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 C = _mm_ceil_ps(A);
    float[4] expected = [2.0f, -2.0f, 54.0f, -2.0f];
    assert(C.array == expected);
}

/// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value,
/// store the result as a double-precision floating-point element in the lower element of result,
/// and copy the upper element from `a` to the upper element of dst.
__m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Only lane 0 is replaced; the upper lane of `a` passes through untouched.
        auto roundedLow = vcvtps_s64_f64(b[0]);
        a[0] = roundedLow;
        return a;
    }
    else
    {
        // Rounding mode 2 is _MM_FROUND_TO_POS_INF.
        return _mm_round_sd!2(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    __m128d B = _mm_setr_pd(53.6, -3.7);
    __m128d C = _mm_ceil_sd(A, B);
    double[2] expected = [54.0, -2.12];
    assert(C.array == expected);
}

/// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
/// store the result as a single-precision floating-point element in the lower element of result,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Only lane 0 is replaced; the upper lanes of `a` pass through untouched.
        auto roundedLow = vcvtps_s32_f32(b[0]);
        a[0] = roundedLow;
        return a;
    }
    else
    {
        // Rounding mode 2 is _MM_FROUND_TO_POS_INF.
        return _mm_round_ss!2(a, b);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
    __m128 C = _mm_ceil_ss(A, B);
    float[4] expected = [54.0f, -2.12f, -4.5f, 1.1f];
    assert(C.array == expected);
}

/// Compare packed 64-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
    }
    else version(LDC)
    {
        // LDC x86: generates pcmpeqq since LDC 1.1 -O1
        //     arm64: generates cmeq since LDC 1.8 -O1
        long2 lhs = cast(long2)a;
        long2 rhs = cast(long2)b;
        return cast(__m128i) equalMask!long2(lhs, rhs);
    }
    else
    {
        // Clever pcmpeqd + pand use with LDC 1.24 -O2
        long2 lhs = cast(long2)a;
        long2 rhs = cast(long2)b;
        long2 mask;
        // Each lane becomes all-ones (-1) when equal, zero otherwise.
        if (lhs.array[0] == rhs.array[0])
            mask.ptr[0] = -1;
        else
            mask.ptr[0] = 0;
        if (lhs.array[1] == rhs.array[1])
            mask.ptr[1] = -1;
        else
            mask.ptr[1] = 0;
        return cast(__m128i)mask;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, -2);
    __m128i B = _mm_setr_epi64(-3, -2);
    __m128i C = _mm_setr_epi64(-1, -4);
    long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
    long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
    long[2] expected1 = [0, -1];
    long[2] expected2 = [-1, 0];
    assert(AB.array == expected1);
    assert(AC.array == expected2);
}


/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
        // Take the four low 16-bit lanes, then sign-extend them to 32 bits.
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = sext <4 x i16> %v to <4 x i32>
            ret <4 x i32> %r`;
        // The IR yields an int4, so the cast must target the integer vector type
        // (this previously cast to __m128d, which does not match the return type).
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
    }
    else
    {
        // Scalar fallback: short -> int assignment sign-extends each lane.
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi16_epi32(A);
    int[4] correct = [-1, 0, -32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
        // Take the two low 16-bit lanes, then sign-extend them to 64 bits.
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i16> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
    }
    else
    {
        // Scalar fallback: short -> long assignment sign-extends each lane.
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi16_epi64(A);
    long[2] correct = [-32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
        // Take the two low 32-bit lanes, then sign-extend them to 64 bits.
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i32> %v to <2 x i64>
            ret <2 x i64> %r`;
        long2 widened = LDCInlineIR!(ir, long2, int4)(cast(int4)a);
        return cast(__m128i) widened;
    }
    else
    {
        // Scalar fallback: int -> long assignment sign-extends each lane.
        int4 narrow = cast(int4)a;
        long2 widened;
        widened.ptr[0] = narrow.array[0];
        widened.ptr[1] = narrow.array[1];
        return cast(__m128i)widened;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepi32_epi64(A);
    long[2] expected = [-4, 42];
    assert(C.array == expected);
}


/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepi8_epi16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
    }
    else version(LDC)
    {
        // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0
        // LDC ARM64: sshll generated since LDC 1.8.0 -O1
        // Take the eight low bytes, then sign-extend them to 16 bits.
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
            %r = sext <8 x i8> %v to <8 x i16>
            ret <8 x i16> %r`;
        short8 widened = LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
        return cast(__m128i) widened;
    }
    else
    {
        // Scalar fallback: byte -> short assignment sign-extends each lane.
        byte16 narrow = cast(byte16)a;
        short8 widened;
        foreach (lane; 0 .. 8)
            widened.ptr[lane] = narrow.array[lane];
        return cast(__m128i)widened;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepi8_epi16(A);
    short[8] expected = [127, -128, 1, -1, 0, 2, -4, -8];
    assert(C.array == expected);
}


/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE41)
    {
        // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
        // Take the four low bytes, then sign-extend them to 32 bits.
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            %r = sext <4 x i8> %v to <4 x i32>
            ret <4 x i32> %r`;
        int4 widened = LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
        return cast(__m128i) widened;
    }
    else
    {
        // LDC ARM64: this gives the same codegen than a vmovl_s16/vmovl_s8 sequence would
        byte16 narrow = cast(byte16)a;
        int4 widened;
        widened.ptr[0] = narrow.array[0];
        widened.ptr[1] = narrow.array[1];
        widened.ptr[2] = narrow.array[2];
        widened.ptr[3] = narrow.array[3];
        return cast(__m128i)widened;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi8_epi32(A);
    int[4] expected = [127, -128, 1, -1];
    assert(C.array == expected);
}


/// Sign extend packed 8-bit integers in the low 2 bytes of `a` to packed 64-bit integers.
__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxbq since LDC 1.1 -O0,
        // LDC arm64: it's ok since LDC 1.8 -O1
        // Take the two low bytes, then sign-extend them to 64 bits.
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i8> %v to <2 x i64>
            ret <2 x i64> %r`;
        long2 widened = LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
        return cast(__m128i) widened;
    }
    else
    {
        // Scalar fallback: byte -> long assignment sign-extends each lane.
        byte16 narrow = cast(byte16)a;
        long2 widened;
        foreach (lane; 0 .. 2)
            widened.ptr[lane] = narrow.array[lane];
        return cast(__m128i)widened;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi8_epi64(A);
    long[2] expected = [127, -128];
    assert(C.array == expected);
}


/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
        //     arm64: ushll since LDC 1.12 -O1
        // Going through ushort makes the widening zero-extend instead of sign-extend.
        short8 narrow = cast(short8)a;
        int4 widened;
        widened.ptr[0] = cast(ushort)narrow.array[0];
        widened.ptr[1] = cast(ushort)narrow.array[1];
        widened.ptr[2] = cast(ushort)narrow.array[2];
        widened.ptr[3] = cast(ushort)narrow.array[3];
        return cast(__m128i)widened;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu16_epi32(A);
    int[4] expected = [65535, 0, 32768, 32767];
    assert(C.array == expected);
}


/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit shorter than below, in -O2
        short8 sa = cast(short8)a;
        long2 r;
        for(int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ushort)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
        // Going through ushort makes the widening zero-extend instead of sign-extend.
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu16_epi64(A);
    long[2] correct = [65535, 0];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // pmovzxdq consumes 32-bit lanes, so the operand is passed as int4
        // (same operand type as the pmovsxdq call in _mm_cvtepi32_epi64),
        // not as short8.
        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
        //     arm64: generates ushll since LDC 1.12 -O1
        // Going through uint makes the widening zero-extend instead of sign-extend.
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = cast(uint)sa.array[0];
        r.ptr[1] = cast(uint)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepu32_epi64(A);
    long[2] correct = [4294967295, 42];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepu8_epi16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // NOTE(review): the sibling conversions alias `ubyte16` locally before using it;
        // here it is presumably provided by an imported module - confirm.
        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
    }
    else
    {
        // LDC x86: generates pmovzxbw since LDC 1.12 -O1 also good without SSE4.1
        //     arm64: ushll since LDC 1.12 -O1
        // PERF: catastrophic with GDC without SSE4.1
        // Going through ubyte makes the widening zero-extend instead of sign-extend.
        byte16 narrow = cast(byte16)a;
        short8 widened;
        widened.ptr[0] = cast(ubyte)narrow.array[0];
        widened.ptr[1] = cast(ubyte)narrow.array[1];
        widened.ptr[2] = cast(ubyte)narrow.array[2];
        widened.ptr[3] = cast(ubyte)narrow.array[3];
        widened.ptr[4] = cast(ubyte)narrow.array[4];
        widened.ptr[5] = cast(ubyte)narrow.array[5];
        widened.ptr[6] = cast(ubyte)narrow.array[6];
        widened.ptr[7] = cast(ubyte)narrow.array[7];
        return cast(__m128i)widened;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepu8_epi16(A);
    short[8] expected = [127, 128, 1, 255, 0, 2, 252, 248];
    assert(C.array == expected);
}


/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit better than below in -O2
        byte16 narrow = cast(byte16)a;
        int4 widened;
        foreach (lane; 0 .. 4)
            widened.ptr[lane] = cast(ubyte)narrow.array[lane];
        return cast(__m128i)widened;
    }
    else
    {
        // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
        // PERF: catastrophic with GDC without SSE4.1
        // Going through ubyte makes the widening zero-extend instead of sign-extend.
        byte16 narrow = cast(byte16)a;
        int4 widened;
        widened.ptr[0] = cast(ubyte)narrow.array[0];
        widened.ptr[1] = cast(ubyte)narrow.array[1];
        widened.ptr[2] = cast(ubyte)narrow.array[2];
        widened.ptr[3] = cast(ubyte)narrow.array[3];
        return cast(__m128i)widened;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu8_epi32(A);
    int[4] expected = [127, 128, 1, 255];
    assert(C.array == expected);
}

/// Zero extend packed unsigned 8-bit integers in the low 2 bytes of `a` to packed 64-bit integers.
__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: this loop optimizes better than the unrolled form below
        byte16 narrow = cast(byte16)a;
        long2 widened;
        foreach (lane; 0 .. 2)
            widened.ptr[lane] = cast(ubyte)narrow.array[lane];
        return cast(__m128i)widened;
    }
    else
    {
        // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
        // Going through ubyte makes the widening zero-extend instead of sign-extend.
        byte16 narrow = cast(byte16)a;
        long2 widened;
        widened.ptr[0] = cast(ubyte)narrow.array[0];
        widened.ptr[1] = cast(ubyte)narrow.array[1];
        return cast(__m128i)widened;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu8_epi64(A);
    long[2] expected = [127, 254];
    assert(C.array == expected);
}

/// Conditionally multiply the packed double-precision (64-bit) floating-point elements
/// in `a` and `b` using bits [5:4] of `imm8`, sum the two products, and conditionally
/// store the sum in dst using bits [1:0] of `imm8`.
__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // Only bits [5:4] and [1:0] are forwarded to the builtin.
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else static if (LDC_with_SSE41)
    {
        // Only bits [5:4] and [1:0] are forwarded to the builtin.
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else
    {
        enum int productMask = (imm8 >>> 4) & 3; // which products enter the sum
        enum int storeMask   = imm8 & 3;         // which lanes receive the sum

        __m128d zeroes = _mm_setzero_pd();
        // Unselected products are replaced by zero before summing.
        __m128d products = _mm_blend_pd!productMask(zeroes, a * b);
        double total = products.array[0] + products.array[1];
        // Unselected output lanes stay zero.
        return _mm_blend_pd!storeMask(zeroes, _mm_set1_pd(total));
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 8.0);
    double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B);
    double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B);
    double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B);
    double[2] expected1 = [ 4.0,  4.0];
    double[2] expected2 = [16.0,  0.0];
    double[2] expected3 = [ 0.0, 20.0];
    assert(R1.array == expected1);
    assert(R2.array == expected2);
    assert(R3.array == expected3);
}

/// Conditionally multiply the packed single-precision (32-bit) floating-point elements
/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products,
/// and conditionally store the sum in result using the low 4 bits of `imm8`.
__m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(byte)imm8);
    }
    else
    {
        enum int productMask = (imm8 >>> 4) & 15; // which products enter the sum
        enum int storeMask   = imm8 & 15;         // which lanes receive the sum

        __m128 zeroes = _mm_setzero_ps();
        // Unselected products are replaced by zero before summing.
        __m128 products = _mm_blend_ps!productMask(zeroes, a * b);
        float total = products.array[0] + products.array[1] + products.array[2] + products.array[3];
        // Unselected output lanes stay zero.
        return _mm_blend_ps!storeMask(zeroes, _mm_set1_ps(total));
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
    __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
    float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B);
    float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B);
    float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B);
    float[4] expected1 = [67.0f, 67.0f, 67.0f, 67.0f];
    float[4] expected2 = [23.0f, 0.0f, 23.0f, 0.0f];
    float[4] expected3 = [0.0f, 29.0f, 0.0f, 29.0f];
    assert(R1.array == expected1);
    assert(R2.array == expected2);
    assert(R3.array == expected3);
}


/// Extract a 32-bit integer from `a`, selected with `imm8`.
int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
{
    // Only the two low bits of imm8 pick the lane.
    int4 lanes = cast(int4)a;
    return lanes.array[imm8 & 3];
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    assert(_mm_extract_epi32(A, 0) == 1);
    assert(_mm_extract_epi32(A, 1 + 8) == 2);
    assert(_mm_extract_epi32(A, 3 + 4) == 4);
}

/// Extract a 64-bit integer from `a`, selected with `imm8`.
long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
{
    // Only the lowest bit of imm8 picks the lane.
    return (cast(long2)a).array[imm8 & 1];
}
unittest
{
    __m128i A = _mm_setr_epi64(45, -67);
    assert(_mm_extract_epi64(A, 0) == 45);
    assert(_mm_extract_epi64(A, 1) == -67);
    assert(_mm_extract_epi64(A, 2) == 45);
}

/// Extract an 8-bit integer from `a`, selected with `imm8`.
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
{
    // Only the four low bits of imm8 pick the lane.
    byte selected = (cast(byte16)a).array[imm8 & 15];
    // Going through ubyte gives the zero-extension.
    return cast(ubyte) selected;
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
    assert(_mm_extract_epi8(A, 7) == 7);
    assert(_mm_extract_epi8(A, 13) == 255);
    assert(_mm_extract_epi8(A, 7 + 16) == 7);
}

/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
/// Note: returns a 32-bit $(I integer).
int _mm_extract_ps (__m128 a, const int imm8) @trusted
{
    // The lane is reinterpreted as an integer, not converted.
    int4 bits = cast(int4)a;
    return bits.array[imm8 & 3];
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
    assert(_mm_extract_ps(A, 0) == 0x3f800000);
    assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
    assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
}



/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an
/// integer value, and store the results as packed double-precision floating-point elements.
__m128d _mm_floor_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        // Convert to integers, then widen each lane back to double.
        long2 asInteger = vcvtmq_s64_f64(a);
        double2 result;
        result.ptr[0] = asInteger.array[0];
        result.ptr[1] = asInteger.array[1];
        return result;
    }
    else
    {
        // Rounding mode 1 is _MM_FROUND_TO_NEG_INF.
        return _mm_round_pd!1(a);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3f, -2.12f);
    __m128d B = _mm_setr_pd(53.6f, -2.7f);
    A = _mm_floor_pd(A);
    B = _mm_floor_pd(B);
    double[2] expectedA = [1.0, -3.0];
    double[2] expectedB = [53.0, -3.0];
    assert(A.array == expectedA);
    assert(B.array == expectedB);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an
/// integer value, and store the results as packed single-precision floating-point elements.
__m128 _mm_floor_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        // Convert to integers, then widen each lane back to float.
        int4 asInteger = vcvtmq_s32_f32(a);
        float4 result;
        result.ptr[0] = asInteger.array[0];
        result.ptr[1] = asInteger.array[1];
        result.ptr[2] = asInteger.array[2];
        result.ptr[3] = asInteger.array[3];
        return result;
    }
    else
    {
        // Rounding mode 1 is _MM_FROUND_TO_NEG_INF.
        return _mm_round_ps!1(a);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 C = _mm_floor_ps(A);
    float[4] expected = [1.0f, -3.0f, 53.0f, -3.0f];
    assert(C.array == expected);
}

/// Round the lower double-precision (64-bit) floating-point element in `b` down to an
/// integer value, store the result as a double-precision floating-point element in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        // Only lane 0 is replaced; the upper lane of `a` passes through untouched.
        auto roundedLow = vcvtms_s64_f64(b[0]);
        a[0] = roundedLow;
        return a;
    }
    else
    {
        // Rounding mode 1 is _MM_FROUND_TO_NEG_INF.
        return _mm_round_sd!1(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    __m128d B = _mm_setr_pd(-53.1, -3.7);
    __m128d C = _mm_floor_sd(A, B);
    double[2] expected = [-54.0, -2.12];
    assert(C.array == expected);
}

/// Round the lower single-precision (32-bit) floating-point element in `b` down to an
/// integer value, store the result as a single-precision floating-point element in the
/// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
__m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
{
    static if (!LDC_with_ARM64)
    {
        return _mm_round_ss!_MM_FROUND_FLOOR(a, b);
    }
    else
    {
        // Only lane 0 of `a` is overwritten, the other three lanes pass through.
        auto floored = vcvtms_s32_f32(b[0]);
        a[0] = floored;
        return a;
    }
}
unittest
{
    __m128 upper = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 lower = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
    __m128 got = _mm_floor_ss(upper, lower);
    float[4] expected = [-540.0f, -2.12f, -4.5f, 1.1f];
    assert(got.array == expected);
}

/// Return a copy of `a` in which the 32-bit lane selected by `imm8[1:0]` is replaced by `i`.
__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
    // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
    // LDC arm64: ins.s since LDC 1.8 -O2
    int4 lanes = cast(int4)a;
    lanes.ptr[imm8 & 3] = i;
    return cast(__m128i)lanes;
}
unittest
{
    __m128i input = _mm_setr_epi32(1, 2, 3, 4);
    int4 got = cast(int4) _mm_insert_epi32(input, 5, 2 + 4); // bits above imm8[1:0] are ignored
    int[4] expected = [1, 2, 5, 4];
    assert(got.array == expected);
}

/// Return a copy of `a` in which the 64-bit lane selected by `imm8[0]` is replaced by `i`.
__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
    // LDC x86: always do something sensible.
    long2 lanes = cast(long2)a;
    lanes.ptr[imm8 & 1] = i;
    return cast(__m128i)lanes;
}
unittest
{
    __m128i input = _mm_setr_epi64(1, 2);
    long2 got = cast(long2) _mm_insert_epi64(input, 5, 1 + 2); // bits above imm8[0] are ignored
    long[2] expected = [1, 5];
    assert(got.array == expected);
}

/// Insert the 8-bit integer `i` into `a` at the location specified by `imm8[3:0]`.
/// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8.
__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
{
    // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
    // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
    byte16 lanes = cast(byte16)a;
    lanes.ptr[imm8 & 15] = cast(byte)i; // only the low 8 bits of `i` are kept
    return cast(__m128i)lanes;
}
unittest
{
    __m128i input = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte16 got = cast(byte16) _mm_insert_epi8(input, 30, 4 + 16); // lane index is taken modulo 16
    byte[16] expected = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    assert(got.array == expected);
}


/// Warning: despite the name, this is unrelated to what `_mm_insert_epi32` does!
/// Start from a copy of `a`. Take the single-precision (32-bit) element of `b` selected by
/// `imm8[7:6]` and write it into the lane of the copy selected by `imm8[5:4]`. Finally, every
/// lane whose bit is set in `imm8[3:0]` is forced to zero, and the result is returned.
__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
    }
    else
    {
        // `imm8` is a template argument, so its three fields are compile-time constants.
        enum int srcLane  = (imm8 >> 6) & 3;
        enum int dstLane  = (imm8 >> 4) & 3;
        enum int zeroMask = imm8 & 15;

        float4 patched = a;
        patched.ptr[dstLane] = b.array[srcLane];

        // Blending against an all-zero vector clears the lanes flagged in `zeroMask`.
        return _mm_blend_ps!zeroMask(patched, _mm_setzero_ps());
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 got = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
    float[4] expected = [1.0f, 2.0f, 0.0f, 7.0f];
    assert(got.array == expected);
}


/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxsd since LDC 1.1 -O1
        // ARM: smax.4s since LDC 1.8 -O1
        int4 va = cast(int4)a;
        int4 vb = cast(int4)b;
        int4 aWins = greaterMask!int4(va, vb);
        return cast(__m128i)( (aWins & va) | (~aWins & vb) );
    }
    else
    {
        // Branch-free select: start from `b` and flip it into `a` (b ^ (a ^ b) == a)
        // only in the lanes where `a` is the greater value.
        __m128i aWins = _mm_cmpgt_epi32(a, b);
        __m128i diff  = _mm_xor_si128(a, b);
        __m128i flip  = _mm_and_si128(diff, aWins);
        return _mm_xor_si128(b, flip);
    }
}
unittest
{
    int4 got = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff,  1, -4,  7),
                                        _mm_setr_epi32(        -4, -8,  9, -8));
    int[4] expected =                                 [0x7fffffff,  1,  9,  7];
    assert(got.array == expected);
}

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed maximum values.
__m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pmaxsb since LDC 1.1 -O1
        // ARM64: smax.16b since LDC 1.8.0 -O1
        byte16 va = cast(byte16)a;
        byte16 vb = cast(byte16)b;
        byte16 aWins = cast(byte16) greaterMask!byte16(va, vb);
        return cast(__m128i)( (aWins & va) | (~aWins & vb) );
    }
    else
    {
        // Branch-free select: start from `b` and flip it into `a` (b ^ (a ^ b) == a)
        // only in the lanes where `a` is the greater value.
        __m128i aWins = _mm_cmpgt_epi8(a, b);
        __m128i diff  = _mm_xor_si128(a, b);
        __m128i flip  = _mm_and_si128(diff, aWins);
        return _mm_xor_si128(b, flip);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 got = cast(byte16) _mm_max_epi8(A, B);
    byte[16] expected =      [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(got.array == expected);
}

/// Lane-wise maximum of the packed unsigned 16-bit integers in `a` and `b`.
__m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pmaxuw since LDC 1.1 -O1
        // ARM64: umax.8h since LDC 1.8.0 -O1
        // PERF: without sse4.1, LLVM 12 produces a very interesting
        //          psubusw xmm0, xmm1
        //          paddw   xmm0, xmm1
        //       sequence that maybe should go in other min/max intrinsics?
        ushort8 va = cast(ushort8)a;
        ushort8 vb = cast(ushort8)b;
        ushort8 aWins = cast(ushort8) greaterMask!ushort8(va, vb);
        return cast(__m128i)( (aWins & va) | (~aWins & vb) );
    }
    else
    {
        // Saturating (b - a) is zero wherever a >= b, so adding `a` back
        // yields `a` in those lanes and `b` in the others.
        __m128i excess = _mm_subs_epu16(b, a);
        return _mm_add_epi16(excess, a);
    }
}
unittest
{
    short8 got = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,      7, 0, 57),
                                            _mm_setr_epi16(   -4, -8,  9, -7, 0, -32768, 0,  0));
    short[8] expected =                                   [   -4, -8, -4, -7, 9, -32768, 0, 57];
    assert(got.array == expected);
}

/// Lane-wise maximum of the packed unsigned 32-bit integers in `a` and `b`.
__m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umax.4s since LDC 1.8.0 -O1
        uint4 va = cast(uint4)a;
        uint4 vb = cast(uint4)b;
        uint4 aWins = cast(uint4) greaterMask!uint4(va, vb);
        return cast(__m128i)( (aWins & va) | (~aWins & vb) );
    }
    else
    {
        // Only a signed compare is available here: biasing both operands by
        // 0x80000000 turns the unsigned ordering into the signed one.
        __m128i bias  = _mm_set1_epi32(-0x80000000);
        __m128i aWins = _mm_cmpgt_epi32(_mm_add_epi32(a, bias), _mm_add_epi32(b, bias));
        __m128i diff  = _mm_xor_si128(a, b);
        __m128i flip  = _mm_and_si128(diff, aWins); // b ^ (a ^ b) == a where `a` wins
        return _mm_xor_si128(b, flip);
    }
}
unittest
{
    int4 got = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff,  1, 4, -7),
                                        _mm_setr_epi32(        -4, -8, 9, -8));
    int[4] expected =                                 [        -4, -8, 9, -7];
    assert(got.array == expected);
}

/// Compare packed signed 32-bit integers in `a` and `b`, returns packed minimum values.
__m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
        // ARM: smin.4s since LDC 1.8 -O1
        int4 va = cast(int4)a;
        int4 vb = cast(int4)b;
        int4 aIsGreater = greaterMask!int4(va, vb);
        return cast(__m128i)( (~aIsGreater & va) | (aIsGreater & vb) );
    }
    else
    {
        // Branch-free select: start from `b` and flip it into `a` (b ^ (a ^ b) == a)
        // only in the lanes where `a` is the smaller value.
        __m128i aWins = _mm_cmplt_epi32(a, b);
        __m128i diff  = _mm_xor_si128(a, b);
        __m128i flip  = _mm_and_si128(diff, aWins);
        return _mm_xor_si128(b, flip);
    }
}
unittest
{
    int4 got = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff,  1, -4,  7),
                                        _mm_setr_epi32(        -4, -8,  9, -8));
    int[4] expected =                                 [        -4, -8, -4, -8];
    assert(got.array == expected);
}

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed minimum values.
__m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pminsb since LDC 1.1 -O1
        // ARM64: smin.16b since LDC 1.8.0 -O1
        byte16 va = cast(byte16)a;
        byte16 vb = cast(byte16)b;
        byte16 aIsGreater = cast(byte16) greaterMask!byte16(va, vb);
        return cast(__m128i)( (~aIsGreater & va) | (aIsGreater & vb) );
    }
    else
    {
        // Branch-free select: start from `b` and flip it into `a` (b ^ (a ^ b) == a)
        // only in the lanes where `a` is the smaller value.
        __m128i aWins = _mm_cmplt_epi8(a, b);
        __m128i diff  = _mm_xor_si128(a, b);
        __m128i flip  = _mm_and_si128(diff, aWins);
        return _mm_xor_si128(b, flip);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 got = cast(byte16) _mm_min_epi8(A, B);
    byte[16] expected =      [  4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(got.array == expected);
}

/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
__m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
        // ARM64: umin.8h since LDC 1.8.0 -O1
        ushort8 va = cast(ushort8)a;
        ushort8 vb = cast(ushort8)b;
        // Note the swapped operands: the mask is set where `b` is greater, i.e. where `a` is the minimum.
        ushort8 bIsGreater = cast(ushort8) greaterMask!ushort8(vb, va);
        return cast(__m128i)( (bIsGreater & va) | (~bIsGreater & vb) );
    }
    else
    {
        // Saturating (b - a) is zero wherever a >= b; removing it from `b`
        // leaves `b` in those lanes and `a` in the others.
        __m128i excess = _mm_subs_epu16(b, a);
        return _mm_sub_epi16(b, excess);
    }
}
unittest
{
    short8 got = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,      7, 0, 57),
                                            _mm_setr_epi16(   -4, -8,  9, -7, 0, -32768, 0,  0));
    short[8] expected =                                   [32767,  1,  9, -8, 0,      7, 0,  0];
    assert(got.array == expected);
}

/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
__m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umin.4s since LDC 1.8.0 -O1
        uint4 va = cast(uint4)a;
        uint4 vb = cast(uint4)b;
        uint4 aIsGreater = cast(uint4) greaterMask!uint4(va, vb);
        return cast(__m128i)( (~aIsGreater & va) | (aIsGreater & vb) );
    }
    else
    {
        // Only a signed compare is available here: biasing both operands by
        // 0x80000000 turns the unsigned ordering into the signed one.
        __m128i bias  = _mm_set1_epi32(-0x80000000);
        __m128i aWins = _mm_cmpgt_epi32(_mm_add_epi32(b, bias), _mm_add_epi32(a, bias));
        __m128i diff  = _mm_xor_si128(a, b);
        __m128i flip  = _mm_and_si128(diff, aWins); // b ^ (a ^ b) == a where `a` is smaller
        return _mm_xor_si128(b, flip);
    }
}
unittest
{
    int4 got = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff,  1, 4, -7),
                                        _mm_setr_epi32(        -4, -8, 9, -8));
    int[4] expected =                                 [0x7fffffff,  1, 4, -8];
    assert(got.array == expected);
}

/// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`,
/// store the minimum and index in return value, and zero the remaining bits.
__m128i _mm_minpos_epu16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        // Interleave lane indices with the values, then reduce with three
        // unsigned 32-bit minimums: across the two halves, then by 8 bytes, then by 4 bytes.
        __m128i laneIds = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i pairsLo = _mm_unpacklo_epi16(laneIds, a);
        __m128i pairsHi = _mm_unpackhi_epi16(laneIds, a);
        __m128i winner  = _mm_min_epu32(pairsLo, pairsHi);
        winner = _mm_min_epu32(winner, _mm_srli_si128!8(winner));
        winner = _mm_min_epu32(winner, _mm_srli_si128!4(winner));

        short8 w = cast(short8)winner;
        short8 r;
        r[0] = w[1]; // the minimum value
        r[1] = w[0]; // its index. Note: the search must have inverted index in order to prioritize lower index in case of tie
        r[2] = 0;
        r[3] = 0;
        r[4] = 0;
        r[5] = 0;
        r[6] = 0;
        r[7] = 0;
        return cast(__m128i)r;
    }
    else
    {
        // Scalar scan. The strict `<` keeps the first (lowest) index on ties.
        short8 lanes = cast(short8)a;
        ushort smallest = 0xffff;
        int where = 0;
        foreach (n; 0 .. 8)
        {
            const ushort candidate = cast(ushort) lanes.array[n];
            if (candidate < smallest)
            {
                smallest = candidate;
                where = n;
            }
        }
        short8 r; // lanes 2 to 7 are left at their initial value
        r.ptr[0] = cast(short)smallest;
        r.ptr[1] = cast(short)where;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
    __m128i B = _mm_setr_epi16(14,  4, 4, 2, -3, 2, 5, 6);
    short8 gotA = cast(short8) _mm_minpos_epu16(A);
    short8 gotB = cast(short8) _mm_minpos_epu16(B);
    short[8] expectedA = [1, 2, 0, 0, 0, 0, 0, 0];
    short[8] expectedB = [2, 3, 0, 0, 0, 0, 0, 0]; // tie between lanes 3 and 5: lane 3 wins
    assert(gotA.array == expectedA);
    assert(gotB.array == expectedB);
}

/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers
/// in `a` compared to those in `b`, and store the 16-bit results in dst.
/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`.
/// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`.
/// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting
/// at the offset specified in `imm8[2]`.
__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
    }
    else
    {
        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high order quadruplet are unaddressable...

        byte16 ba = cast(byte16)a;
        short8 r;

        // The selected quadruplet of `b` is one whole 32-bit lane. It is placed at the start
        // of each 64-bit half so that a single _mm_sad_epu8 computes two SADs at once.
        __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);

        for (int j = 0; j < 8; j += 2)
        {
            int k = a_offset + j;
            // Low half: quadruplet of `a` starting at byte k. High half: the one starting at k+1.
            __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
                                           0, 0, 0, 0,
                                           ba[k+1], ba[k+2], ba[k+3], ba[k+4],
                                           0, 0, 0, 0);
            short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64
            r.ptr[j]   = diffs.array[0];
            r.ptr[j+1] = diffs.array[4];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
    short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
    short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
    short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
    short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
    short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
    short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
    short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
    short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
    short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
    short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B); // bits above imm8[2:0] are ignored
    assert(r1.array == correct1);
    assert(r4.array == correct4);
    assert(r5.array == correct5);
    assert(r7.array == correct7);
    assert(r8.array == correct0);
}

/// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`,
/// and return the signed 64-bit results.
__m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSE41)
    {
        // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
        // Use IR instead.
        // This generates pmuldq since LDC 1.2.0 -O0
        enum ir = `
            %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
            %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
            %la = sext <2 x i32> %ia to <2 x i64>
            %lb = sext <2 x i32> %ib to <2 x i64>
            %r = mul <2 x i64> %la, %lb
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // 3 instructions since LDC 1.8 -O2
        // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull
        int2 a_lo = vmovn_s64(cast(long2)a);
        int2 b_lo = vmovn_s64(cast(long2)b);
        return cast(__m128i) vmull_s32(a_lo, b_lo);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        long2 r;
        // Widen the left operand first so that the product is computed in 64 bits.
        r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
        r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    long2 R = cast(long2) _mm_mul_epi32(A, B);
    long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
    assert(R.array == correct);
}

/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers,
/// return the low 32 bits of the intermediate integers.
__m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    // PERF GDC without SSE4.1 could be better
    static if (GDC_with_SSE41)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        // Note: older GDC doesn't have that op, but older GDC
        // also has no support for -msse4.1 detection
        return cast(__m128i)(ia * ib);
    }
    else version(LDC)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        return cast(__m128i)(ia * ib);
    }
    else
    {
        // DMD doesn't take the above
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] * ib.array[0];
        r.ptr[1] = ia.array[1] * ib.array[1];
        r.ptr[2] = ia.array[2] * ib.array[2];
        r.ptr[3] = ia.array[3] * ib.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    int4 R = cast(int4) _mm_mullo_epi32(A, B);
    int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
    assert(R.array == correct);
}


/// Convert packed signed 32-bit integers from `a` and `b`
/// to packed 16-bit integers using unsigned saturation.
__m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE41)
    {
        // PERF For some reason doesn't generate the builtin???
        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Negative inputs are raised to zero with a signed max before the narrowing step.
        int4 z;
        z = 0;
        return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
                                          vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
    }
    else
    {
        // PERF: not great without SSE4.1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        align(16) ushort[8] result;
        for (int i = 0; i < 4; ++i)
        {
            // Lanes of `a` fill the low half of the result, clamped to [0, 65535]...
            int s = sa.array[i];
            if (s < 0) s = 0;
            if (s > 65535) s = 65535;
            result.ptr[i] = cast(ushort)s;

            // ...and lanes of `b` fill the high half, clamped the same way.
            s = sb.array[i];
            if (s < 0) s = 0;
            if (s > 65535) s = 65535;
            result.ptr[i+4] = cast(ushort)s;
        }
        return *cast(__m128i*)(result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packus_epi32(A, A);
    short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
    assert(R.array == correct);
}


/// Round the packed double-precision (64-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed double-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_pd(int rounding)(__m128d a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Current rounding mode requested: convert as-is, without touching the mode.
            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1]; // move the upper lane down so the same conversion can be reused
            long hi = _mm_cvtsd_si64(a);
            return _mm_setr_pd(lo, hi);
        }
        else
        {
            version(GNU) pragma(inline, false); // else fail unittest with optimizations

            // Save the current rounding mode, install the requested one for the
            // duration of the conversions, and restore the saved mode before returning.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1]; // move the upper lane down so the same conversion can be reused
            long hi = _mm_cvtsd_si64(a);

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return _mm_setr_pd(lo, hi);
        }
    }
}
unittest
{
    // tested in other intrinsics
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ps(int rounding)(__m128 a) @trusted
{
    // PERF ARM64: there is duplication because this isn't optimal for ARM64, so it is avoided externally
    static if (GDC_or_LDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Current rounding mode requested: round-trip through 32-bit integers as-is.
            __m128i integers = _mm_cvtps_epi32(a);
            return _mm_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 gets shuffled
            // Save the current rounding mode and install the requested one;
            // the scope guard restores the saved mode on exit.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m128i integers = _mm_cvtps_epi32(a);

            // Convert back to float to achieve the rounding
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            __m128 result = _mm_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}


/// Round the lower double-precision (64-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a double-precision floating-point element
/// in the lower element of result, and copy the upper element from `a` to the upper element of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Current rounding mode requested: convert as-is, without touching the mode.
            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0; // the integer is converted back to double by this store
            return a;
        }
        else
        {
            version(GNU) pragma(inline, false); // else fail unittest with optimizations

            // Save the current rounding mode, install the requested one for the
            // duration of the conversion, and restore the saved mode before returning.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}


/// Round the lower single-precision (32-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a single-precision floating-point element
/// in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Current rounding mode requested: convert as-is, without touching the mode.
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0; // the integer is converted back to float by this store
            return a;
        }
        else version(GNU)
        {
            // Same sequence as the generic branch below, wrapped in a nested
            // function that is explicitly marked as not inlinable.
            pragma(inline, false)
            __m128 GDCworkaround() nothrow @nogc @trusted
            {
                uint old = _MM_GET_ROUNDING_MODE();
                _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

                // Convert to 32-bit integer
                int b0 = _mm_cvtss_si32(b);
                a.ptr[0] = b0;

                // Convert back to float to achieve the rounding
                // The problem is that a 32-bit float can't represent all the values
                // a 32-bit integer can (and vice-versa). So this function won't work for
                // large values. (TODO: what range exactly?)
                _MM_SET_ROUNDING_MODE(old);
                return a;
            }
            return GDCworkaround();
        }
        else
        {
            // Save the current rounding mode, install the requested one for the
            // duration of the conversion, and restore the saved mode before returning.
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 32-bit integer
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;

            // Convert back to float to achieve the rounding
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}


/// Load 128-bits of integer data from memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
__m128i _mm_stream_load_si128 (__m128i * mem_addr) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr);
    }
    else version(LDC)
    {
        // Inline LLVM IR: a plain vector load tagged with `!nontemporal` metadata.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(mem_addr);
    }
    else
    {
        return *mem_addr; // regular move instead
    }
}
// TODO unittest


/// Return 1 if all bits in `a` are all 1's. Else return 0.
int _mm_test_all_ones (__m128i a) @safe
{
    // Tests `a` against an all-ones mask with _mm_testc_si128.
    return _mm_testc_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(-1);
    __m128i B = _mm_set_epi32(-1, -2, -1, -1);
    assert(_mm_test_all_ones(A) == 1);
    assert(_mm_test_all_ones(B) == 0);
}

/// Return 1 if all bits in `a` are all 0's. Else return 0.
// This is a #BONUS since it was lacking in Intel Intrinsics API.
int _mm_test_all_zeros (__m128i a) @safe
{
    // Testing against an all-ones mask examines every bit of `a`.
    __m128i everyBit = _mm_set1_epi32(-1);
    return _mm_testz_si128(a, everyBit);
}
unittest
{
    __m128i allClear = _mm_set1_epi32(0);
    __m128i oneSet   = _mm_set_epi32(0, 8, 0, 0);
    assert(_mm_test_all_zeros(allClear) == 1);
    assert(_mm_test_all_zeros(oneSet) == 0);
}

/// AND the 128 bits (representing integer data) of `a` with those of `mask`;
/// the result is 1 when that AND is entirely zero, and 0 otherwise.
int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
{
    // it's really the same as _mm_testz_si128, but with a good name
    int allZero = _mm_testz_si128(a, mask);
    return allZero;
}

/// Two tests are made on `a` and `mask` (128 bits of integer data each). ZF is 1 when
/// `a` AND `mask` is zero, else 0. CF is 1 when (NOT `a`) AND `mask` is zero, else 0.
/// The result is 1 only when both ZF and CF are zero; otherwise it is 0.
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
{
    // Same operation as _mm_testnzc_si128, under a more descriptive name.
    int mixed = _mm_testnzc_si128(a, mask);
    return mixed;
}

/// Compute the bitwise NOT of a and then AND with b, and return 1 if the
/// result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 1 in `a`.
int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41 || LDC_with_SSE41)
    {
        // GDC and LDC both provide the builtin under the same name.
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 leftover = vbicq_s64(cast(long2)b, cast(long2)a);
        auto lane0 = vgetq_lane_s64(leftover, 0);
        auto lane1 = vgetq_lane_s64(leftover, 1);
        // Result is 1 only when both 64-bit lanes are zero.
        return !(lane0 | lane1);
    }
    else
    {
        // Portable path: the bits of `b` that are not set in `a` must all be zero.
        int[4] zero = [0, 0, 0, 0];
        __m128i leftover = ~a & b;
        return leftover.array == zero;
    }
}
unittest
{
    __m128i value     = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i uncovered = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00); // has bits that are clear in `value`
    __m128i covered   = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00); // every bit is also set in `value`
    assert(_mm_testc_si128(value, value) == 1);
    assert(_mm_testc_si128(value, uncovered) == 0);
    assert(_mm_testc_si128(value, covered) == 1);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
{
    // NOTE(review): `_mm_testc_si128` is declared `pure` but this function is not;
    // confirm whether the difference is intentional.
    // PERF DMD
    static if (GDC_with_SSE41 || LDC_with_SSE41)
    {
        // GDC and LDC both provide the builtin under the same name.
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        long2 overlap = vandq_s64(cast(long2)b, cast(long2)a);
        long2 onlyInB = vbicq_s64(cast(long2)b, cast(long2)a);

        // zf: `a & b` is all zero. cf: `~a & b` is all zero.
        bool zf = !(vgetq_lane_s64(overlap, 0) | vgetq_lane_s64(overlap, 1));
        bool cf = !(vgetq_lane_s64(onlyInB, 0) | vgetq_lane_s64(onlyInB, 1));
        return !(cf | zf);
    }
    else
    {
        int[4] zero = [0, 0, 0, 0];
        __m128i overlap = a & b;
        __m128i onlyInB = ~a & b;
        bool zf = (overlap.array == zero);
        bool cf = (onlyInB.array == zero);
        // 1 only when neither intermediate result is zero.
        return !zf && !cf;
    }
}
unittest
{
    __m128i value = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i mixed = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00); // 0x01 is set in `value`, 0x40 is clear
    __m128i none  = _mm_setzero_si128();
    assert(_mm_testnzc_si128(value, none) == 0);  // a & b is zero
    assert(_mm_testnzc_si128(value, mixed) == 1); // neither a & b nor ~a & b is zero
    assert(_mm_testnzc_si128(value, value) == 0); // ~a & b is zero
}

/// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm_testz_si128 (__m128i a, __m128i b) @trusted
{
    // NOTE(review): `_mm_testc_si128` is declared `pure` but this function is not;
    // confirm whether the difference is intentional.
    // PERF DMD
    static if (GDC_with_SSE41 || LDC_with_SSE41)
    {
        // GDC and LDC both provide the builtin under the same name.
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 overlap = vandq_s64(cast(long2)a, cast(long2)b);
        auto lane0 = vgetq_lane_s64(overlap, 0);
        auto lane1 = vgetq_lane_s64(overlap, 1);
        // Result is 1 only when both 64-bit lanes are zero.
        return !(lane0 | lane1);
    }
    else
    {
        // Portable path: `a` and `b` must have no set bit in common.
        int[4] zero = [0, 0, 0, 0];
        __m128i overlap = a & b;
        return overlap.array == zero;
    }
}
unittest
{
    __m128i value    = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i disjoint = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07); // shares no set bit with `value`
    __m128i touching = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00); // shares bit 0x04 with `value`
    assert(_mm_testz_si128(value, value) == 0);
    assert(_mm_testz_si128(value, disjoint) == 1);
    assert(_mm_testz_si128(value, touching) == 0);
}