1 /** 2 * SSE4.1 intrinsics. 3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1 4 * 5 * Copyright: Guillaume Piolat 2021. 6 * Johan Engelen 2021. 7 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 8 */ 9 module inteli.smmintrin; 10 11 // SSE4.1 instructions 12 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1 13 // Note: this header will work whether you have SSE4.1 enabled or not. 14 // With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively 15 // generate SSE4.1 instructions. 16 // With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions. 17 18 public import inteli.types; 19 import inteli.internals; 20 21 // smmintrin pulls in all previous instruction set intrinsics. 22 public import inteli.tmmintrin; 23 24 nothrow @nogc: 25 26 enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes 27 enum int _MM_FROUND_TO_NEG_INF = 0x01; /// ditto 28 enum int _MM_FROUND_TO_POS_INF = 0x02; /// ditto 29 enum int _MM_FROUND_TO_ZERO = 0x03; /// ditto 30 enum int _MM_FROUND_CUR_DIRECTION = 0x04; /// ditto 31 enum int _MM_FROUND_RAISE_EXC = 0x00; /// ditto 32 enum int _MM_FROUND_NO_EXC = 0x08; /// ditto 33 34 enum int _MM_FROUND_NINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT); 35 enum int _MM_FROUND_FLOOR = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF); 36 enum int _MM_FROUND_CEIL = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF); 37 enum int _MM_FROUND_TRUNC = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO); 38 enum int _MM_FROUND_RINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION); 39 enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION); 40 41 /// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results. 42 // Note: changed signature, GDC needs a compile-time value for imm8. 43 __m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) @trusted 44 { 45 // PERF DMD 46 static if (GDC_with_SSE41) 47 { 48 return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8); 49 } 50 else 51 { 52 // LDC x86 This generates pblendw since LDC 1.1 and -O2 53 short8 r; 54 short8 sa = cast(short8)a; 55 short8 sb = cast(short8)b; 56 for (int n = 0; n < 8; ++n) 57 { 58 r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n]; 59 } 60 return cast(__m128i)r; 61 } 62 } 63 unittest 64 { 65 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 66 __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); 67 short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011 68 short[8] correct = [8, 9, 2, 3, 12, 5, 6, 15]; 69 assert(C.array == correct); 70 } 71 72 73 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`. 74 // Note: changed signature, GDC needs a compile-time value for `imm8`. 75 __m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted 76 { 77 static assert(imm8 >= 0 && imm8 < 4); 78 // PERF DMD 79 static if (GDC_with_SSE41) 80 { 81 return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8); 82 } 83 else 84 { 85 // LDC x86: blendpd since LDC 1.1 -02, uses blendps after LDC 1.12 86 double2 r; 87 for (int n = 0; n < 2; ++n) 88 { 89 r.ptr[n] = (imm8 & (1 << n)) ? 
b.array[n] : a.array[n]; 90 } 91 return cast(__m128d)r; 92 } 93 } 94 unittest 95 { 96 __m128d A = _mm_setr_pd(0, 1); 97 __m128d B = _mm_setr_pd(8, 9); 98 double2 C = _mm_blend_pd!2(A, B); 99 double[2] correct = [0, 9]; 100 assert(C.array == correct); 101 } 102 103 104 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control 105 /// mask `imm8`. 106 // Note: changed signature, GDC needs a compile-time value for imm8. 107 __m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) @trusted 108 { 109 // PERF DMD 110 static assert(imm8 >= 0 && imm8 < 16); 111 static if (GDC_with_SSE41) 112 { 113 return __builtin_ia32_blendps(a, b, imm8); 114 } 115 else version(LDC) 116 { 117 // LDC x86: generates blendps since LDC 1.1 -O2 118 // arm64: pretty good, two instructions worst case 119 return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0, 120 (imm8 & 2) ? 5 : 1, 121 (imm8 & 4) ? 6 : 2, 122 (imm8 & 8) ? 7 : 3)(a, b); 123 } 124 else 125 { 126 __m128 r; // PERF =void; 127 for (int n = 0; n < 4; ++n) 128 { 129 r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n]; 130 } 131 return r; 132 } 133 } 134 unittest 135 { 136 __m128 A = _mm_setr_ps(0, 1, 2, 3); 137 __m128 B = _mm_setr_ps(8, 9, 10, 11); 138 float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101 139 float[4] correct = [8, 1, 10, 11]; 140 assert(C.array == correct); 141 } 142 143 /// Blend packed 8-bit integers from `a` and `b` using `mask`. 144 __m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted 145 { 146 // PERF DMD 147 /*static if (GDC_with_SSE41) 148 { 149 // This intrinsic do nothing in GDC 12. 150 // TODO report to GDC. No problem in GCC. 151 return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask); 152 } 153 else*/ 154 static if (LDC_with_SSE41) 155 { 156 return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask); 157 } 158 else static if (LDC_with_ARM64) 159 { 160 // LDC arm64: two instructions since LDC 1.12 -O2 161 byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7); 162 return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a); 163 } 164 else 165 { 166 __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask); 167 return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b); 168 } 169 } 170 unittest 171 { 172 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 173 8, 9, 10, 11, 12, 13, 14, 15); 174 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 175 24, 25, 26, 27, 28, 29, 30, 31); 176 __m128i M = _mm_setr_epi8( 1, -1, 1, 1, -4, 1, -8, 127, 177 1, 1, -1, -1, 4, 1, 8, -128); 178 byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M); 179 byte[16] correct = [ 0, 17, 2, 3, 20, 5, 22, 7, 180 8, 9, 26, 27, 12, 13, 14, 31 ]; 181 assert(R.array == correct); 182 } 183 184 185 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`. 186 __m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted 187 { 188 // PERF DMD 189 static if (GDC_with_SSE42) 190 { 191 // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction 192 // with -msse4.2 but not -msse4.1. 193 // Not sure what is the reason, and there is a replacement sequence. 194 // Sounds like a bug. 
195 return __builtin_ia32_blendvpd(a, b, mask); 196 } 197 else static if (LDC_with_SSE41) 198 { 199 return __builtin_ia32_blendvpd(a, b, mask); 200 } 201 else static if (LDC_with_ARM64) 202 { 203 long2 shift; 204 shift = 63; 205 long2 lmask = cast(long2)mask >> shift; 206 return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a); 207 } 208 else 209 { 210 __m128d r; // PERF =void; 211 long2 lmask = cast(long2)mask; 212 for (int n = 0; n < 2; ++n) 213 { 214 r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; 215 } 216 return r; 217 } 218 } 219 unittest 220 { 221 __m128d A = _mm_setr_pd(1.0, 2.0); 222 __m128d B = _mm_setr_pd(3.0, 4.0); 223 __m128d M1 = _mm_setr_pd(-3.0, 2.0); 224 __m128d R1 = _mm_blendv_pd(A, B, M1); 225 double[2] correct1 = [3.0, 2.0]; 226 assert(R1.array == correct1); 227 228 // Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost 229 // See Issue #78 230 __m128d M2 = _mm_setr_pd(double.nan, double.infinity); 231 __m128d R2 = _mm_blendv_pd(A, B, M2); 232 double[2] correct2 = [1.0, 2.0]; 233 assert(R2.array == correct2); 234 } 235 236 237 /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`. 238 __m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted 239 { 240 // PERF DMD 241 static if (GDC_with_SSE41) 242 { 243 return __builtin_ia32_blendvps(a, b, mask); 244 } 245 else static if (LDC_with_SSE41) 246 { 247 return __builtin_ia32_blendvps(a, b, mask); 248 } 249 else static if (LDC_with_ARM64) 250 { 251 int4 shift; 252 shift = 31; 253 int4 lmask = cast(int4)mask >> shift; 254 return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a); 255 } 256 else 257 { 258 __m128 r; // PERF =void; 259 int4 lmask = cast(int4)mask; 260 for (int n = 0; n < 4; ++n) 261 { 262 r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; 263 } 264 return r; 265 } 266 } 267 unittest 268 { 269 __m128 A = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f); 270 __m128 B = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f); 271 __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f); 272 __m128 M2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f); 273 __m128 R1 = _mm_blendv_ps(A, B, M1); 274 __m128 R2 = _mm_blendv_ps(A, B, M2); 275 float[4] correct1 = [ 4.0f, 1.0f, 2.0f, 7.0f]; 276 float[4] correct2 = [ 0.0f, 1.0f, 6.0f, 3.0f]; 277 assert(R1.array == correct1); 278 279 // Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost 280 // See Issue #78 281 assert(R2.array == correct2); 282 } 283 284 /// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value, 285 /// and store the results as packed double-precision floating-point elements. 
286 __m128d _mm_ceil_pd (__m128d a) @trusted 287 { 288 static if (LDC_with_ARM64) 289 { 290 // LDC arm64 acceptable since 1.8 -O2 291 // Unfortunately x86 intrinsics force a round-trip back to double2 292 // ARM neon semantics wouldn't have that 293 long2 l = vcvtpq_s64_f64(a); 294 double2 r; 295 r.ptr[0] = l.array[0]; 296 r.ptr[1] = l.array[1]; 297 return r; 298 } 299 else 300 { 301 return _mm_round_pd!2(a); 302 } 303 } 304 unittest 305 { 306 __m128d A = _mm_setr_pd(1.3f, -2.12f); 307 __m128d B = _mm_setr_pd(53.6f, -2.7f); 308 A = _mm_ceil_pd(A); 309 B = _mm_ceil_pd(B); 310 double[2] correctA = [2.0, -2.0]; 311 double[2] correctB = [54.0, -2.0]; 312 assert(A.array == correctA); 313 assert(B.array == correctB); 314 } 315 316 /// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, 317 /// and store the results as packed single-precision floating-point elements. 318 __m128 _mm_ceil_ps (__m128 a) @trusted 319 { 320 static if (LDC_with_ARM64) 321 { 322 // LDC arm64 acceptable since 1.8 -O1 323 int4 l = vcvtpq_s32_f32(a); 324 float4 r; 325 r.ptr[0] = l.array[0]; 326 r.ptr[1] = l.array[1]; 327 r.ptr[2] = l.array[2]; 328 r.ptr[3] = l.array[3]; 329 return r; 330 } 331 else 332 { 333 return _mm_round_ps!2(a); 334 } 335 } 336 unittest 337 { 338 __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f); 339 __m128 C = _mm_ceil_ps(A); 340 float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f]; 341 assert(C.array == correct); 342 } 343 344 /// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, 345 /// store the result as a double-precision floating-point element in the lower element of result, 346 /// and copy the upper element from `a` to the upper element of dst. 347 __m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted 348 { 349 static if (LDC_with_ARM64) 350 { 351 a[0] = vcvtps_s64_f64(b[0]); 352 return a; 353 } 354 else 355 { 356 return _mm_round_sd!2(a, b); 357 } 358 } 359 unittest 360 { 361 __m128d A = _mm_setr_pd(1.3, -2.12); 362 __m128d B = _mm_setr_pd(53.6, -3.7); 363 __m128d C = _mm_ceil_sd(A, B); 364 double[2] correct = [54.0, -2.12]; 365 assert(C.array == correct); 366 } 367 368 /// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value, 369 /// store the result as a single-precision floating-point element in the lower element of result, 370 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 371 __m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted 372 { 373 static if (LDC_with_ARM64) 374 { 375 a[0] = vcvtps_s32_f32(b[0]); 376 return a; 377 } 378 else 379 { 380 return _mm_round_ss!2(a, b); 381 } 382 } 383 unittest 384 { 385 __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f); 386 __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f); 387 __m128 C = _mm_ceil_ss(A, B); 388 float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f]; 389 assert(C.array == correct); 390 } 391 392 /// Compare packed 64-bit integers in `a` and `b` for equality. 
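/// Each 64-bit lane of the result is all ones (-1) where the corresponding elements are equal, and zero otherwise.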
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
    }
    else version(LDC)
    {
        // LDC x86: generates pcmpeqq since LDC 1.1 -O1
        // arm64: generates cmeq since LDC 1.8 -O1
        return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
    }
    else
    {
        // Clever pcmpeqd + pand use with LDC 1.24 -O2
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 res;
        res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
        res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
        return cast(__m128i)res;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, -2);
    __m128i B = _mm_setr_epi64(-3, -2);
    __m128i C = _mm_setr_epi64(-1, -4);
    long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
    long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
    long[2] correct1 = [0, -1];
    long[2] correct2 = [-1, 0];
    assert(AB.array == correct1);
    assert(AC.array == correct2);
}


/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = sext <4 x i16> %v to <4 x i32>
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi16_epi32(A);
    int[4] correct = [-1, 0, -32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i16> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi16_epi64(A);
    long[2] correct = [-32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
502 __m128i _mm_cvtepi32_epi64 (__m128i a) @trusted 503 { 504 // PERF DMD 505 static if (GDC_with_SSE41) 506 { 507 return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a); 508 } 509 else version(LDC) 510 { 511 // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64 512 enum ir = ` 513 %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1> 514 %r = sext <2 x i32> %v to <2 x i64> 515 ret <2 x i64> %r`; 516 return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a); 517 } 518 else 519 { 520 int4 sa = cast(int4)a; 521 long2 r; 522 r.ptr[0] = sa.array[0]; 523 r.ptr[1] = sa.array[1]; 524 return cast(__m128i)r; 525 } 526 } 527 unittest 528 { 529 __m128i A = _mm_setr_epi32(-4, 42, 0, 0); 530 long2 C = cast(long2) _mm_cvtepi32_epi64(A); 531 long[2] correct = [-4, 42]; 532 assert(C.array == correct); 533 } 534 535 536 /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers. 537 __m128i _mm_cvtepi8_epi16 (__m128i a) @trusted 538 { 539 // PERF DMD 540 static if (GDC_with_SSE41) 541 { 542 alias ubyte16 = __vector(ubyte[16]); 543 return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a); 544 } 545 else version(LDC) 546 { 547 // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0 548 // LDC ARM64: sshll generated since LDC 1.8.0 -O1 549 enum ir = ` 550 %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 551 %r = sext <8 x i8> %v to <8 x i16> 552 ret <8 x i16> %r`; 553 return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a); 554 } 555 else 556 { 557 byte16 sa = cast(byte16)a; 558 short8 r; 559 foreach(n; 0..8) 560 r.ptr[n] = sa.array[n]; 561 return cast(__m128i)r; 562 } 563 } 564 unittest 565 { 566 __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); 567 short8 C = cast(short8) _mm_cvtepi8_epi16(A); 568 short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8]; 569 assert(C.array == correct); 570 } 571 572 573 /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers. 574 __m128i _mm_cvtepi8_epi32 (__m128i a) @trusted 575 { 576 // PERF DMD 577 static if (GDC_with_SSE41) 578 { 579 alias ubyte16 = __vector(ubyte[16]); 580 return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a); 581 } 582 else static if (LDC_with_SSE41) 583 { 584 // LDC x86: Generates pmovsxbd since LDC 1.1 -O0 585 enum ir = ` 586 %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 587 %r = sext <4 x i8> %v to <4 x i32> 588 ret <4 x i32> %r`; 589 return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a); 590 } 591 else 592 { 593 // LDC ARM64: this gives the same codegen than a vmovl_s16/vmovl_s8 sequence would 594 byte16 sa = cast(byte16)a; 595 int4 r; 596 r.ptr[0] = sa.array[0]; 597 r.ptr[1] = sa.array[1]; 598 r.ptr[2] = sa.array[2]; 599 r.ptr[3] = sa.array[3]; 600 return cast(__m128i)r; 601 } 602 } 603 unittest 604 { 605 __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); 606 int4 C = cast(int4) _mm_cvtepi8_epi32(A); 607 int[4] correct = [127, -128, 1, -1]; 608 assert(C.array == correct); 609 } 610 611 612 /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers. 
613 __m128i _mm_cvtepi8_epi64 (__m128i a) @trusted 614 { 615 // PERF DMD 616 static if (GDC_with_SSE41) 617 { 618 alias ubyte16 = __vector(ubyte[16]); 619 return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a); 620 } 621 else version(LDC) 622 { 623 // LDC x86: Generates pmovsxbq since LDC 1.1 -O0, 624 // LDC arm64: it's ok since LDC 1.8 -O1 625 enum ir = ` 626 %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1> 627 %r = sext <2 x i8> %v to <2 x i64> 628 ret <2 x i64> %r`; 629 return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a); 630 } 631 else 632 { 633 byte16 sa = cast(byte16)a; 634 long2 r; 635 foreach(n; 0..2) 636 r.ptr[n] = sa.array[n]; 637 return cast(__m128i)r; 638 } 639 } 640 unittest 641 { 642 __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); 643 long2 C = cast(long2) _mm_cvtepi8_epi64(A); 644 long[2] correct = [127, -128]; 645 assert(C.array == correct); 646 } 647 648 649 /// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers. 650 __m128i _mm_cvtepu16_epi32 (__m128i a) @trusted 651 { 652 // PERF DMD 653 static if (GDC_with_SSE41) 654 { 655 return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a); 656 } 657 else 658 { 659 // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1 660 // arm64: ushll since LDC 1.12 -O1 661 short8 sa = cast(short8)a; 662 int4 r; 663 r.ptr[0] = cast(ushort)sa.array[0]; 664 r.ptr[1] = cast(ushort)sa.array[1]; 665 r.ptr[2] = cast(ushort)sa.array[2]; 666 r.ptr[3] = cast(ushort)sa.array[3]; 667 return cast(__m128i)r; 668 } 669 } 670 unittest 671 { 672 __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0); 673 int4 C = cast(int4) _mm_cvtepu16_epi32(A); 674 int[4] correct = [65535, 0, 32768, 32767]; 675 assert(C.array == correct); 676 } 677 678 679 /// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers. 680 __m128i _mm_cvtepu16_epi64 (__m128i a) @trusted 681 { 682 // PERF DMD 683 static if (GDC_with_SSE41) 684 { 685 return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a); 686 } 687 else static if (LDC_with_ARM64) 688 { 689 // LDC arm64: a bit shorter than below, in -O2 690 short8 sa = cast(short8)a; 691 long2 r; 692 for(int n = 0; n < 2; ++n) 693 r.ptr[n] = cast(ushort)sa.array[n]; 694 return cast(__m128i)r; 695 } 696 else 697 { 698 // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1 699 short8 sa = cast(short8)a; 700 long2 r; 701 r.ptr[0] = cast(ushort)sa.array[0]; 702 r.ptr[1] = cast(ushort)sa.array[1]; 703 return cast(__m128i)r; 704 } 705 } 706 unittest 707 { 708 __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0); 709 long2 C = cast(long2) _mm_cvtepu16_epi64(A); 710 long[2] correct = [65535, 0]; 711 assert(C.array == correct); 712 } 713 714 715 /// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers. 
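/// For instance, a 32-bit `-1` (0xFFFFFFFF) widens to 4294967295, as the unittest below checks.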
__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
        // arm64: generates ushll since LDC 1.12 -O1
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = cast(uint)sa.array[0];
        r.ptr[1] = cast(uint)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepu32_epi64(A);
    long[2] correct = [4294967295, 42];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepu8_epi16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
    }
    else
    {
        // LDC x86: generates pmovzxbw since LDC 1.12 -O1 also good without SSE4.1
        // arm64: ushll since LDC 1.12 -O1
        // PERF: catastrophic with GDC without SSE4.1
        byte16 sa = cast(byte16)a;
        short8 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        r.ptr[2] = cast(ubyte)sa.array[2];
        r.ptr[3] = cast(ubyte)sa.array[3];
        r.ptr[4] = cast(ubyte)sa.array[4];
        r.ptr[5] = cast(ubyte)sa.array[5];
        r.ptr[6] = cast(ubyte)sa.array[6];
        r.ptr[7] = cast(ubyte)sa.array[7];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepu8_epi16(A);
    short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit better than below in -O2
        byte16 sa = cast(byte16)a;
        int4 r;
        for(int n = 0; n < 4; ++n)
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
        // PERF: catastrophic with GDC without SSE4.1
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        r.ptr[2] = cast(ubyte)sa.array[2];
        r.ptr[3] = cast(ubyte)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu8_epi32(A);
    int[4] correct = [127, 128, 1, 255];
    assert(C.array == correct);
}

/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
818 __m128i _mm_cvtepu8_epi64 (__m128i a) @trusted 819 { 820 // PERF DMD 821 static if (GDC_with_SSE41) 822 { 823 alias ubyte16 = __vector(ubyte[16]); 824 return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a); 825 } 826 else static if (LDC_with_ARM64) 827 { 828 // LDC arm64: this optimizes better than the loop below 829 byte16 sa = cast(byte16)a; 830 long2 r; 831 for (int n = 0; n < 2; ++n) 832 r.ptr[n] = cast(ubyte)sa.array[n]; 833 return cast(__m128i)r; 834 } 835 else 836 { 837 // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1 838 byte16 sa = cast(byte16)a; 839 long2 r; 840 r.ptr[0] = cast(ubyte)sa.array[0]; 841 r.ptr[1] = cast(ubyte)sa.array[1]; 842 return cast(__m128i)r; 843 } 844 } 845 unittest 846 { 847 __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); 848 long2 C = cast(long2) _mm_cvtepu8_epi64(A); 849 long[2] correct = [127, 254]; 850 assert(C.array == correct); 851 } 852 853 /// Conditionally multiply the packed double-precision (64-bit) floating-point elements 854 /// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, and conditionally 855 /// store the sum in dst using the low 4 bits of `imm8`. 856 __m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted 857 { 858 // PERF DMD 859 static if (GDC_with_SSE41) 860 { 861 return __builtin_ia32_dppd(a, b, imm8 & 0x33); 862 } 863 else static if (LDC_with_SSE41) 864 { 865 return __builtin_ia32_dppd(a, b, imm8 & 0x33); 866 } 867 else 868 { 869 __m128d zero = _mm_setzero_pd(); 870 __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b); 871 double sum = temp.array[0] + temp.array[1]; 872 return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum)); 873 } 874 } 875 unittest 876 { 877 __m128d A = _mm_setr_pd(1.0, 2.0); 878 __m128d B = _mm_setr_pd(4.0, 8.0); 879 double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B); 880 double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B); 881 double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B); 882 double[2] correct1 = [ 4.0, 4.0]; 883 double[2] correct2 = [16.0, 0.0]; 884 double[2] correct3 = [ 0.0, 20.0]; 885 assert(R1.array == correct1); 886 assert(R2.array == correct2); 887 assert(R3.array == correct3); 888 } 889 890 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements 891 /// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, 892 /// and conditionally store the sum in result using the low 4 bits of `imm8`. 
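/// For example, `imm8 = 0x31` multiplies lanes 0 and 1 only (high nibble 0x3), sums those
/// two products, and writes the sum to lane 0 only (low nibble 0x1), zeroing lanes 1 to 3.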
893 __m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted 894 { 895 // PERF DMD 896 static if (GDC_with_SSE41) 897 { 898 return __builtin_ia32_dpps(a, b, cast(ubyte)imm8); 899 } 900 else static if (LDC_with_SSE41) 901 { 902 return __builtin_ia32_dpps(a, b, cast(byte)imm8); 903 } 904 else 905 { 906 __m128 zero = _mm_setzero_ps(); 907 __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b); 908 float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3]; 909 return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum)); 910 } 911 } 912 unittest 913 { 914 __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f); 915 __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f); 916 float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B); 917 float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B); 918 float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B); 919 float[4] correct1 = [67.0f, 67.0f, 67.0f, 67.0f]; 920 float[4] correct2 = [23.0f, 0.0f, 23.0f, 0.0f]; 921 float[4] correct3 = [0.0f, 29.0f, 0.0f, 29.0f]; 922 assert(R1.array == correct1); 923 assert(R2.array == correct2); 924 assert(R3.array == correct3); 925 } 926 927 928 /// Extract a 32-bit integer from `a`, selected with `imm8`. 929 int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted 930 { 931 return (cast(int4)a).array[imm8 & 3]; 932 } 933 unittest 934 { 935 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 936 assert(_mm_extract_epi32(A, 0) == 1); 937 assert(_mm_extract_epi32(A, 1 + 8) == 2); 938 assert(_mm_extract_epi32(A, 3 + 4) == 4); 939 } 940 941 /// Extract a 64-bit integer from `a`, selected with `imm8`. 942 long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted 943 { 944 long2 la = cast(long2)a; 945 return la.array[imm8 & 1]; 946 } 947 unittest 948 { 949 __m128i A = _mm_setr_epi64(45, -67); 950 assert(_mm_extract_epi64(A, 0) == 45); 951 assert(_mm_extract_epi64(A, 1) == -67); 952 assert(_mm_extract_epi64(A, 2) == 45); 953 } 954 955 /// Extract an 8-bit integer from `a`, selected with `imm8`. 956 /// Warning: the returned value is zero-extended to 32-bits. 957 int _mm_extract_epi8 (__m128i a, const int imm8) @trusted 958 { 959 byte16 ba = cast(byte16)a; 960 return cast(ubyte) ba.array[imm8 & 15]; 961 } 962 unittest 963 { 964 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15); 965 assert(_mm_extract_epi8(A, 7) == 7); 966 assert(_mm_extract_epi8(A, 13) == 255); 967 assert(_mm_extract_epi8(A, 7 + 16) == 7); 968 } 969 970 /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`. 971 /// Note: returns a 32-bit $(I integer). 972 int _mm_extract_ps (__m128 a, const int imm8) @trusted 973 { 974 return (cast(int4)a).array[imm8 & 3]; 975 } 976 unittest 977 { 978 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f); 979 assert(_mm_extract_ps(A, 0) == 0x3f800000); 980 assert(_mm_extract_ps(A, 1 + 8) == 0x40000000); 981 assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000); 982 } 983 984 985 986 /// Round the packed double-precision (64-bit) floating-point elements in `a` down to an 987 /// integer value, and store the results as packed double-precision floating-point elements. 
988 __m128d _mm_floor_pd (__m128d a) @trusted 989 { 990 static if (LDC_with_ARM64) 991 { 992 // LDC arm64 acceptable since 1.8 -O2 993 long2 l = vcvtmq_s64_f64(a); 994 double2 r; 995 r.ptr[0] = l.array[0]; 996 r.ptr[1] = l.array[1]; 997 return r; 998 } 999 else 1000 { 1001 return _mm_round_pd!1(a); 1002 } 1003 } 1004 unittest 1005 { 1006 __m128d A = _mm_setr_pd(1.3f, -2.12f); 1007 __m128d B = _mm_setr_pd(53.6f, -2.7f); 1008 A = _mm_floor_pd(A); 1009 B = _mm_floor_pd(B); 1010 double[2] correctA = [1.0, -3.0]; 1011 double[2] correctB = [53.0, -3.0]; 1012 assert(A.array == correctA); 1013 assert(B.array == correctB); 1014 } 1015 1016 /// Round the packed single-precision (32-bit) floating-point elements in `a` down to an 1017 /// integer value, and store the results as packed single-precision floating-point elements. 1018 __m128 _mm_floor_ps (__m128 a) @trusted 1019 { 1020 static if (LDC_with_ARM64) 1021 { 1022 // LDC arm64 acceptable since 1.8 -O1 1023 int4 l = vcvtmq_s32_f32(a); 1024 float4 r; 1025 r.ptr[0] = l.array[0]; 1026 r.ptr[1] = l.array[1]; 1027 r.ptr[2] = l.array[2]; 1028 r.ptr[3] = l.array[3]; 1029 return r; 1030 } 1031 else 1032 { 1033 return _mm_round_ps!1(a); 1034 } 1035 } 1036 unittest 1037 { 1038 __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f); 1039 __m128 C = _mm_floor_ps(A); 1040 float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f]; 1041 assert(C.array == correct); 1042 } 1043 1044 /// Round the lower double-precision (64-bit) floating-point element in `b` down to an 1045 /// integer value, store the result as a double-precision floating-point element in the 1046 /// lower element, and copy the upper element from `a` to the upper element. 1047 __m128d _mm_floor_sd (__m128d a, __m128d b) @trusted 1048 { 1049 static if (LDC_with_ARM64) 1050 { 1051 a[0] = vcvtms_s64_f64(b[0]); 1052 return a; 1053 } 1054 else 1055 { 1056 return _mm_round_sd!1(a, b); 1057 } 1058 } 1059 unittest 1060 { 1061 __m128d A = _mm_setr_pd(1.3, -2.12); 1062 __m128d B = _mm_setr_pd(-53.1, -3.7); 1063 __m128d C = _mm_floor_sd(A, B); 1064 double[2] correct = [-54.0, -2.12]; 1065 assert(C.array == correct); 1066 } 1067 1068 /// Round the lower single-precision (32-bit) floating-point element in `b` down to an 1069 /// integer value, store the result as a single-precision floating-point element in the 1070 /// lower element, and copy the upper 3 packed elements from `a` to the upper elements. 1071 __m128 _mm_floor_ss (__m128 a, __m128 b) @trusted 1072 { 1073 static if (LDC_with_ARM64) 1074 { 1075 a[0] = vcvtms_s32_f32(b[0]); 1076 return a; 1077 } 1078 else 1079 { 1080 return _mm_round_ss!1(a, b); 1081 } 1082 } 1083 unittest 1084 { 1085 __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f); 1086 __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f); 1087 __m128 C = _mm_floor_ss(A, B); 1088 float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f]; 1089 assert(C.array == correct); 1090 } 1091 1092 /// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`. 
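/// Only the two low bits of `imm8` are used, so an out-of-range index wraps around
/// (the unittest below passes `2 + 4` and still writes lane 2).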
1093 __m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted 1094 { 1095 // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1 1096 // LDC x86: psinrd since LDC 1.1 -O2 with -mattr=+sse4.1 1097 // LDC arm64: ins.s since LDC 1.8 -O2 1098 int4 ia = cast(int4)a; 1099 ia.ptr[imm8 & 3] = i; 1100 return cast(__m128i)ia; 1101 } 1102 unittest 1103 { 1104 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 1105 int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4); 1106 int[4] result = [1, 2, 5, 4]; 1107 assert(C.array == result); 1108 } 1109 1110 /// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`. 1111 __m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted 1112 { 1113 // GDC: nothing special to do, psinrq generated with -O1 -msse4.1 1114 // LDC x86: always do something sensible. 1115 long2 la = cast(long2)a; 1116 la.ptr[imm8 & 1] = i; 1117 return cast(__m128i)la; 1118 } 1119 unittest 1120 { 1121 __m128i A = _mm_setr_epi64(1, 2); 1122 long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2); 1123 long[2] result = [1, 5]; 1124 assert(C.array == result); 1125 } 1126 1127 /// Insert the 8-bit integer `i` into `a` at the location specified by `imm8[2:0]`. 1128 /// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8. 1129 __m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted 1130 { 1131 // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1 1132 // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory. 1133 byte16 ba = cast(byte16)a; 1134 ba.ptr[imm8 & 15] = cast(byte)i; 1135 return cast(__m128i)ba; 1136 } 1137 unittest 1138 { 1139 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 1140 byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16); 1141 byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 1142 assert(C.array == result); 1143 } 1144 1145 1146 /// Warning: of course it does something totally different from `_mm_insert_epi32`! 1147 /// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b` 1148 /// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]` 1149 /// (elements are zeroed out when the corresponding bit is set). 1150 __m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted 1151 { 1152 // PERF DMD 1153 static if (GDC_with_SSE41) 1154 { 1155 return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8); 1156 } 1157 else static if (LDC_with_SSE41) 1158 { 1159 return __builtin_ia32_insertps128(a, b, cast(byte)imm8); 1160 } 1161 else 1162 { 1163 float4 tmp2 = a; 1164 float tmp1 = b.array[(imm8 >> 6) & 3]; 1165 tmp2.ptr[(imm8 >> 4) & 3] = tmp1; 1166 return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps()); 1167 } 1168 } 1169 unittest 1170 { 1171 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1172 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1173 __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B); 1174 float[4] correct = [1.0f, 2.0f, 0.0f, 7.0f]; 1175 assert(C.array == correct); 1176 } 1177 1178 1179 /// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values. 
1180 __m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted 1181 { 1182 static if (GDC_with_SSE41) 1183 { 1184 return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b); 1185 } 1186 else version(LDC) 1187 { 1188 // x86: pmaxsd since LDC 1.1 -O1 1189 // ARM: smax.4s since LDC 1.8 -01 1190 int4 sa = cast(int4)a; 1191 int4 sb = cast(int4)b; 1192 int4 greater = greaterMask!int4(sa, sb); 1193 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 1194 } 1195 else 1196 { 1197 __m128i higher = _mm_cmpgt_epi32(a, b); 1198 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1199 __m128i mask = _mm_and_si128(aTob, higher); 1200 return _mm_xor_si128(b, mask); 1201 } 1202 } 1203 unittest 1204 { 1205 int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7), 1206 _mm_setr_epi32( -4,-8, 9, -8)); 1207 int[4] correct = [0x7fffffff, 1, 9, 7]; 1208 assert(R.array == correct); 1209 } 1210 1211 /// Compare packed signed 8-bit integers in `a` and `b`, 1212 /// and return packed maximum values. 1213 __m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted 1214 { 1215 // PERF DMD 1216 static if (GDC_with_SSE41) 1217 { 1218 return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b); 1219 } 1220 else version(LDC) 1221 { 1222 // x86: pmaxsb since LDC 1.1 -O1 1223 // ARM64: smax.16b since LDC 1.8.0 -O1 1224 byte16 sa = cast(byte16)a; 1225 byte16 sb = cast(byte16)b; 1226 byte16 greater = cast(byte16) greaterMask!byte16(sa, sb); 1227 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 1228 } 1229 else 1230 { 1231 __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else 1232 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1233 __m128i mask = _mm_and_si128(aTob, lower); 1234 return _mm_xor_si128(b, mask); 1235 } 1236 } 1237 unittest 1238 { 1239 __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); 1240 __m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 1241 byte16 R = cast(byte16) _mm_max_epi8(A, B); 1242 byte[16] correct = [127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0]; 1243 assert(R.array == correct); 1244 } 1245 1246 /// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values. 1247 __m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted 1248 { 1249 // PERF DMD 1250 static if (GDC_with_SSE41) 1251 { 1252 return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b); 1253 } 1254 else version(LDC) 1255 { 1256 // x86: pmaxuw since LDC 1.1 -O1 1257 // ARM64: umax.8h since LDC 1.8.0 -O1 1258 // PERF: without sse4.1, LLVM 12 produces a very interesting 1259 // psubusw xmm0, xmm1 1260 // paddw xmm0, xmm1 1261 // sequence that maybe should go in other min/max intrinsics? 1262 ushort8 sa = cast(ushort8)a; 1263 ushort8 sb = cast(ushort8)b; 1264 ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb); 1265 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 1266 } 1267 else 1268 { 1269 b = _mm_subs_epu16(b, a); 1270 b = _mm_add_epi16(b, a); 1271 return b; 1272 } 1273 } 1274 unittest 1275 { 1276 short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57), 1277 _mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0)); 1278 short[8] correct = [ -4, -8, -4, -7, 9,-32768, 0, 57]; 1279 assert(R.array == correct); 1280 } 1281 1282 /// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values. 
__m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umax.4s since LDC 1.8.0 -O1
        uint4 sa = cast(uint4)a;
        uint4 sb = cast(uint4)b;
        uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i valueShift = _mm_set1_epi32(-0x80000000);
        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7),
                                      _mm_setr_epi32(        -4,-8, 9, -8));
    int[4] correct =                                [        -4,-8, 9, -7];
    assert(R.array == correct);
}

/// Compare packed signed 32-bit integers in `a` and `b`, returns packed minimum values.
__m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
        // ARM: smin.4s since LDC 1.8 -O1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        int4 greater = greaterMask!int4(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i higher = _mm_cmplt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
                                      _mm_setr_epi32(        -4, -8, 9, -8));
    int[4] correct =                                [        -4, -8, -4, -8];
    assert(R.array == correct);
}

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed minimum values.
__m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pminsb since LDC 1.1 -O1
        // ARM64: smin.16b since LDC 1.8.0 -O1
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_min_epi8(A, B);
    byte[16] correct =       [  4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed minimum values.
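/// Note that the comparison is unsigned: in the unittest below, `-4` in the first operand
/// reads as 65532, so 9 from the second operand is the minimum for that lane.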
1385 __m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted 1386 { 1387 // PERF DMD 1388 static if (GDC_with_SSE41) 1389 { 1390 return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b); 1391 } 1392 else version(LDC) 1393 { 1394 // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1 1395 // ARM64: umin.8h since LDC 1.8.0 -O1 1396 ushort8 sa = cast(ushort8)a; 1397 ushort8 sb = cast(ushort8)b; 1398 ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa); 1399 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 1400 } 1401 else 1402 { 1403 __m128i c = _mm_subs_epu16(b, a); 1404 b = _mm_sub_epi16(b, c); 1405 return b; 1406 } 1407 } 1408 unittest 1409 { 1410 short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57), 1411 _mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0)); 1412 short[8] correct = [32767, 1, 9, -8, 0, 7, 0, 0]; 1413 assert(R.array == correct); 1414 } 1415 1416 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst. 1417 __m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted 1418 { 1419 // PERF DMD 1420 static if (GDC_with_SSE41) 1421 { 1422 return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b); 1423 } 1424 else version(LDC) 1425 { 1426 // x86: pminud since LDC 1.1 -O1, also good without sse4.1 1427 // ARM64: umin.4s since LDC 1.8.0 -O1 1428 uint4 sa = cast(uint4)a; 1429 uint4 sb = cast(uint4)b; 1430 uint4 greater = cast(uint4) greaterMask!uint4(sa, sb); 1431 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 1432 } 1433 else 1434 { 1435 __m128i valueShift = _mm_set1_epi32(-0x80000000); 1436 __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift)); 1437 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1438 __m128i mask = _mm_and_si128(aTob, higher); 1439 return _mm_xor_si128(b, mask); 1440 } 1441 } 1442 unittest 1443 { 1444 int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7), 1445 _mm_setr_epi32( -4,-8, 9, -8)); 1446 int[4] correct = [0x7fffffff, 1, 4, -8]; 1447 assert(R.array == correct); 1448 } 1449 1450 /// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`, 1451 /// store the minimum and index in return value, and zero the remaining bits. 
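/// The minimum ends up in lane 0 and its index in lane 1; the comparison is unsigned and
/// the lowest index wins ties (in the unittest below, `B` holds its minimum 2 at indices
/// 3 and 5, and index 3 is reported).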
__m128i _mm_minpos_epu16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
        __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
        __m128i best = _mm_min_epu32(combinedLo, combinedHi);
        best = _mm_min_epu32(best, _mm_srli_si128!8(best));
        best = _mm_min_epu32(best, _mm_srli_si128!4(best));
        short8 sbest = cast(short8)best;
        short8 r;
        r[0] = sbest[1];
        r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie
        r[2] = 0;
        r[3] = 0;
        r[4] = 0;
        r[5] = 0;
        r[6] = 0;
        r[7] = 0;
        return cast(__m128i)r;
    }
    else
    {
        short8 sa = cast(short8)a;
        ushort min = 0xffff;
        int index = 0;
        for(int n = 0; n < 8; ++n)
        {
            ushort c = sa.array[n];
            if (c < min)
            {
                min = c;
                index = n;
            }
        }
        short8 r;
        r.ptr[0] = min;
        r.ptr[1] = cast(short)index;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
    __m128i B = _mm_setr_epi16(14, 4, 4, 2, -3, 2, 5, 6);
    short8 R1 = cast(short8) _mm_minpos_epu16(A);
    short8 R2 = cast(short8) _mm_minpos_epu16(B);
    short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
    short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
}

/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers
/// in `a` compared to those in `b`, and store the 16-bit results in dst.
/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`.
/// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`.
/// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting
/// at the offset specified in `imm8[2]`.
__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
    }
    else
    {
        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high-order quadruplets are unaddressable...
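        // In other words: imm8[2] selects whether the eight sliding quadruplets start
        // at byte 0 or byte 4 of `a`, and imm8[1:0] selects which aligned 32-bit chunk
        // of `b` serves as the single reference quadruplet.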
1535 int b_offset = (imm8 & 3) * 4; 1536 1537 byte16 ba = cast(byte16)a; 1538 byte16 bb = cast(byte16)b; 1539 short8 r; 1540 1541 __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0); 1542 1543 for (int j = 0; j < 8; j += 2) 1544 { 1545 int k = a_offset + j; 1546 __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3], 1547 0, 0, 0, 0, 1548 ba[k+1], ba[k+2], ba[k+3], ba[k+4], 1549 0, 0, 0, 0); 1550 short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64 1551 r.ptr[j] = diffs.array[0]; 1552 r.ptr[j+1] = diffs.array[4]; 1553 } 1554 return cast(__m128i)r; 1555 } 1556 } 1557 unittest 1558 { 1559 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 1560 __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5, 5, 5, 12, 13, 14, 15); 1561 short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23]; 1562 short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749]; 1563 short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35]; 1564 short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741]; 1565 short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4]; 1566 short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B); 1567 short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B); 1568 short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B); 1569 short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B); 1570 short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B); 1571 assert(r1.array == correct1); 1572 assert(r4.array == correct4); 1573 assert(r5.array == correct5); 1574 assert(r7.array == correct7); 1575 assert(r8.array == correct0); 1576 } 1577 1578 /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst. 1579 __m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted 1580 { 1581 // PERF DMD 1582 static if (GDC_with_SSE41) 1583 { 1584 return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b); 1585 } 1586 else static if (LDC_with_SSE41) 1587 { 1588 // For some reason, clang has the builtin but it's not in IntrinsicsX86.td 1589 // Use IR instead. 
1590 // This generates pmuldq with since LDC 1.2.0 -O0 1591 enum ir = ` 1592 %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2> 1593 %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2> 1594 %la = sext <2 x i32> %ia to <2 x i64> 1595 %lb = sext <2 x i32> %ib to <2 x i64> 1596 %r = mul <2 x i64> %la, %lb 1597 ret <2 x i64> %r`; 1598 return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b); 1599 } 1600 else static if (LDC_with_ARM64) 1601 { 1602 // 3 instructions since LDC 1.8 -O2 1603 // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull 1604 int2 a_lo = vmovn_s64(cast(long2)a); 1605 int2 b_lo = vmovn_s64(cast(long2)b); 1606 return cast(__m128i) vmull_s32(a_lo, b_lo); 1607 } 1608 else 1609 { 1610 int4 ia = cast(int4)a; 1611 int4 ib = cast(int4)b; 1612 long2 r; 1613 r.ptr[0] = cast(long)ia.array[0] * ib.array[0]; 1614 r.ptr[1] = cast(long)ia.array[2] * ib.array[2]; 1615 return cast(__m128i)r; 1616 } 1617 } 1618 unittest 1619 { 1620 __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3); 1621 __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0); 1622 long2 R = cast(long2) _mm_mul_epi32(A, B); 1623 long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144]; 1624 assert(R.array == correct); 1625 } 1626 1627 /// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, 1628 /// return the low 32 bits of the intermediate integers. 1629 __m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted 1630 { 1631 // PERF DMD 1632 // PERF GDC without SSE4.1 could be better 1633 static if (GDC_with_SSE41) 1634 { 1635 int4 ia = cast(int4)a; 1636 int4 ib = cast(int4)b; 1637 // Note: older GDC doesn't have that op, but older GDC 1638 // also has no support for -msse4.1 detection 1639 return cast(__m128i)(a * b); 1640 } 1641 else version(LDC) 1642 { 1643 int4 ia = cast(int4)a; 1644 int4 ib = cast(int4)b; 1645 return cast(__m128i)(a * b); 1646 } 1647 else 1648 { 1649 // DMD doesn't take the above 1650 int4 ia = cast(int4)a; 1651 int4 ib = cast(int4)b; 1652 int4 r; 1653 r.ptr[0] = ia.array[0] * ib.array[0]; 1654 r.ptr[1] = ia.array[1] * ib.array[1]; 1655 r.ptr[2] = ia.array[2] * ib.array[2]; 1656 r.ptr[3] = ia.array[3] * ib.array[3]; 1657 return r; 1658 } 1659 } 1660 unittest 1661 { 1662 __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3); 1663 __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0); 1664 int4 R = cast(int4) _mm_mullo_epi32(A, B); 1665 int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0]; 1666 assert(R.array == correct); 1667 } 1668 1669 1670 /// Convert packed signed 32-bit integers from `a` and `b` 1671 /// to packed 16-bit integers using unsigned saturation. 1672 __m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted 1673 { 1674 static if (GDC_with_SSE41) 1675 { 1676 // PERF For some reason doesn't generates the builtin??? 
1677 return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b); 1678 } 1679 else static if (LDC_with_SSE41) 1680 { 1681 return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b); 1682 } 1683 else static if (LDC_with_ARM64) 1684 { 1685 int4 z; 1686 z = 0; 1687 return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)), 1688 vqmovn_u32(vmaxq_s32(z, cast(int4)b))); 1689 } 1690 else 1691 { 1692 // PERF: not great without SSE4.1 1693 int4 sa = cast(int4)a; 1694 int4 sb = cast(int4)b; 1695 align(16) ushort[8] result; 1696 for (int i = 0; i < 4; ++i) 1697 { 1698 int s = sa.array[i]; 1699 if (s < 0) s = 0; 1700 if (s > 65535) s = 65535; 1701 result.ptr[i] = cast(ushort)s; 1702 1703 s = sb.array[i]; 1704 if (s < 0) s = 0; 1705 if (s > 65535) s = 65535; 1706 result.ptr[i+4] = cast(ushort)s; 1707 } 1708 return *cast(__m128i*)(result.ptr); 1709 } 1710 } 1711 unittest 1712 { 1713 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 1714 short8 R = cast(short8) _mm_packus_epi32(A, A); 1715 short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0]; 1716 assert(R.array == correct); 1717 } 1718 1719 1720 /// Round the packed double-precision (64-bit) floating-point elements in `a` using the 1721 /// rounding parameter, and store the results as packed double-precision floating-point elements. 1722 /// Rounding is done according to the rounding[3:0] parameter, which can be one of: 1723 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions 1724 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions 1725 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions 1726 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions 1727 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE 1728 __m128d _mm_round_pd(int rounding)(__m128d a) @trusted 1729 { 1730 // PERF DMD 1731 static if (GDC_with_SSE41) 1732 { 1733 return __builtin_ia32_roundpd(a, rounding); 1734 } 1735 else static if (LDC_with_SSE41) 1736 { 1737 return __builtin_ia32_roundpd(a, rounding); 1738 } 1739 else 1740 { 1741 static if (rounding & _MM_FROUND_CUR_DIRECTION) 1742 { 1743 // Convert to 64-bit integers 1744 long lo = _mm_cvtsd_si64(a); 1745 a.ptr[0] = a.array[1]; 1746 long hi = _mm_cvtsd_si64(a); 1747 return _mm_setr_pd(lo, hi); 1748 } 1749 else 1750 { 1751 version(GNU) pragma(inline, false); // else fail unittest with optimizations 1752 1753 uint old = _MM_GET_ROUNDING_MODE(); 1754 _MM_SET_ROUNDING_MODE((rounding & 3) << 13); 1755 1756 // Convert to 64-bit integers 1757 long lo = _mm_cvtsd_si64(a); 1758 a.ptr[0] = a.array[1]; 1759 long hi = _mm_cvtsd_si64(a); 1760 1761 // Convert back to double to achieve the rounding 1762 // The problem is that a 64-bit double can't represent all the values 1763 // a 64-bit integer can (and vice-versa). So this function won't work for 1764 // large values. (TODO: what range exactly?) 1765 _MM_SET_ROUNDING_MODE(old); 1766 return _mm_setr_pd(lo, hi); 1767 } 1768 } 1769 } 1770 unittest 1771 { 1772 // tested in other intrinsics 1773 } 1774 1775 /// Round the packed single-precision (32-bit) floating-point elements in `a` using the 1776 /// rounding parameter, and store the results as packed single-precision floating-point elements. 

/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF     |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF     |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO        |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ps(int rounding)(__m128 a) @trusted
{
    // PERF ARM64: this isn't optimal for ARM64, so callers avoid it there, which duplicates some code.
    static if (GDC_or_LDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            __m128i integers = _mm_cvtps_epi32(a);
            return _mm_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get shuffled
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m128i integers = _mm_cvtps_epi32(a);

            // Convert back to float to achieve the rounding.
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice versa), so this function won't work for
            // large values. (TODO: what range exactly?)
            __m128 result = _mm_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
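
// Added sanity-check sketch for _mm_round_ps. It assumes small inputs (no ties for the
// tested modes), where the fallback paths agree with the roundps instruction.
unittest
{
    __m128 A = _mm_setr_ps(1.5f, -1.5f, 2.3f, -2.3f);
    float4 ceiling = _mm_round_ps!(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)(A);
    float4 floored = _mm_round_ps!(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)(A);
    float[4] correctCeil  = [2.0f, -1.0f, 3.0f, -2.0f];
    float[4] correctFloor = [1.0f, -2.0f, 2.0f, -3.0f];
    assert(ceiling.array == correctCeil);
    assert(floored.array == correctFloor);
}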

/// Round the lower double-precision (64-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a double-precision floating-point element
/// in the lower element of result, and copy the upper element from `a` to the upper element of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF     |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF     |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO        |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;
            return a;
        }
        else
        {
            version(GNU) pragma(inline, false); // else unittests fail with optimizations

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;

            // Convert back to double to achieve the rounding.
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice versa), so this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}


/// Round the lower single-precision (32-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a single-precision floating-point element
/// in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF     |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF     |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO        |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;
            return a;
        }
        else version(GNU)
        {
            pragma(inline, false)
            __m128 GDCworkaround() nothrow @nogc @trusted
            {
                uint old = _MM_GET_ROUNDING_MODE();
                _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

                // Convert to 32-bit integer
                int b0 = _mm_cvtss_si32(b);
                a.ptr[0] = b0;

                // Convert back to float to achieve the rounding.
                // The problem is that a 32-bit float can't represent all the values
                // a 32-bit integer can (and vice versa), so this function won't work for
                // large values. (TODO: what range exactly?)
                _MM_SET_ROUNDING_MODE(old);
                return a;
            }
            return GDCworkaround();
        }
        else
        {
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 32-bit integer
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;

            // Convert back to float to achieve the rounding.
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice versa), so this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
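
// Added sanity-check sketch for the scalar variants _mm_round_sd and _mm_round_ss
// (assumes small values, where the fallback paths agree with the SSE4.1 instructions).
unittest
{
    __m128d A = _mm_setr_pd(10.0, 20.0);
    __m128d B = _mm_setr_pd(-2.7, 99.0);
    double2 R = _mm_round_sd!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(A, B);
    double[2] correct = [-2.0, 20.0]; // lower element rounded from B, upper copied from A
    assert(R.array == correct);

    __m128 X = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    __m128 Y = _mm_setr_ps(1.7f, 0.0f, 0.0f, 0.0f);
    float4 S = _mm_round_ss!(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)(X, Y);
    float[4] correctS = [1.0f, 20.0f, 30.0f, 40.0f];
    assert(S.array == correctS);
}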

/// Load 128-bits of integer data from memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
__m128i _mm_stream_load_si128 (__m128i * mem_addr) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr);
    }
    else static if (LDC_with_InlineIREx)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(mem_addr);
    }
    else
    {
        return *mem_addr; // regular move instead
    }
}
// TODO unittest
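// Pending the dedicated unittest above, here is a minimal added sanity-check sketch:
// the non-temporal hint must not change the loaded value, so a plain round-trip suffices.
unittest
{
    align(16) int[4] data = [1, -2, 3, -4];
    int4 R = cast(int4) _mm_stream_load_si128(cast(__m128i*) data.ptr);
    assert(R.array == data);
}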


/// Return 1 if all bits in `a` are 1, otherwise return 0.
int _mm_test_all_ones (__m128i a) @safe
{
    return _mm_testc_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(-1);
    __m128i B = _mm_set_epi32(-1, -2, -1, -1);
    assert(_mm_test_all_ones(A) == 1);
    assert(_mm_test_all_ones(B) == 0);
}

/// Return 1 if all bits in `a` are 0, otherwise return 0.
// This is a #BONUS intrinsic, since it is lacking from the Intel Intrinsics API.
int _mm_test_all_zeros (__m128i a) @safe
{
    return _mm_testz_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(0);
    __m128i B = _mm_set_epi32(0, 8, 0, 0);
    assert(_mm_test_all_zeros(A) == 1);
    assert(_mm_test_all_zeros(B) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
/// and return 1 if the result is zero, otherwise return 0.
int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
{
    return _mm_testz_si128(a, mask); // it's the same operation, just under a clearer name
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, and set ZF to 1
/// if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of `a` and then AND with
/// `mask`, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and
/// CF values are zero, otherwise return 0.
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
{
    return _mm_testnzc_si128(a, mask);
}

/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the
/// result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 1 in `a`.
int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testc_si128(A, A) == 1);
    assert(_mm_testc_si128(A, M1) == 0);
    assert(_mm_testc_si128(A, M2) == 1);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
        long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);

        return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
                | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
    }
    else
    {
        __m128i c = a & b;
        __m128i d = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return !( (c.array == zero) || (d.array == zero));
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
    __m128i Z = _mm_setzero_si128();
    assert(_mm_testnzc_si128(A, Z) == 0);
    assert(_mm_testnzc_si128(A, M) == 1);
    assert(_mm_testnzc_si128(A, A) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm_testz_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testz_si128(A, A) == 0);
    assert(_mm_testz_si128(A, M1) == 1);
    assert(_mm_testz_si128(A, M2) == 0);
}
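
// Added sanity-check sketches for two intrinsics defined above that lack a unittest:
// the masked variant of _mm_test_all_zeros, and _mm_test_mix_ones_zeros.
unittest
{
    __m128i A    = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i mask = _mm_setr_epi32(0xf0, 0x00, 0x00, 0x00);
    assert(_mm_test_all_zeros(A, mask) == 1); // no bit of `mask` is set in A
    assert(_mm_test_all_zeros(A, A) == 0);
}
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M = _mm_setr_epi32(0x03, 0x00, 0x00, 0x00); // bit 0 is set in A, bit 1 is clear
    assert(_mm_test_mix_ones_zeros(A, M) == 1);
    assert(_mm_test_mix_ones_zeros(A, A) == 0);                   // all masked bits are ones
    assert(_mm_test_mix_ones_zeros(A, _mm_setzero_si128()) == 0); // nothing masked
}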