/**
* SSE4.1 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.smmintrin;

// SSE4.1 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
// Note: this header will work whether you have SSE4.1 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
// generate SSE4.1 instructions.

public import inteli.types;
import inteli.internals;

// smmintrin pulls in all previous instruction set intrinsics.
public import inteli.tmmintrin;

nothrow @nogc:

enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
enum int _MM_FROUND_TO_NEG_INF     = 0x01; /// ditto
enum int _MM_FROUND_TO_POS_INF     = 0x02; /// ditto
enum int _MM_FROUND_TO_ZERO        = 0x03; /// ditto
enum int _MM_FROUND_CUR_DIRECTION  = 0x04; /// ditto
enum int _MM_FROUND_RAISE_EXC      = 0x00; /// ditto
enum int _MM_FROUND_NO_EXC         = 0x08; /// ditto

enum int _MM_FROUND_NINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT);
enum int _MM_FROUND_FLOOR     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
enum int _MM_FROUND_CEIL      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
enum int _MM_FROUND_TRUNC     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
enum int _MM_FROUND_RINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC    | _MM_FROUND_CUR_DIRECTION);

/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
    }
    else
    {
        // LDC x86: this generates pblendw since LDC 1.1 and -O2
        short8 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
    short[8] correct = [8, 9, 2, 3, 12, 5, 6, 15];
    assert(C.array == correct);
}
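
// Added example: a small sketch of the two extreme control masks, to make the
// bit-per-lane convention explicit (0 keeps `a` everywhere, 255 takes `b` everywhere).
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 allA = cast(short8) _mm_blend_epi16!0(A, B);
    short8 allB = cast(short8) _mm_blend_epi16!255(A, B);
    short[8] correctA = [0, 1, 2, 3, 4, 5, 6, 7];
    short[8] correctB = [8, 9, 10, 11, 12, 13, 14, 15];
    assert(allA.array == correctA);
    assert(allB.array == correctB);
}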


/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
// Note: changed signature, GDC needs a compile-time value for `imm8`.
__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    static assert(imm8 >= 0 && imm8 < 4);
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
    }
    else
    {
        // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
        double2 r;
        for (int n = 0; n < 2; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return cast(__m128d)r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0, 1);
    __m128d B = _mm_setr_pd(8, 9);
    double2 C = _mm_blend_pd!2(A, B);
    double[2] correct = [0, 9];
    assert(C.array == correct);
}


/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control mask `imm8`.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static assert(imm8 >= 0 && imm8 < 16);
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendps(a, b, imm8);
    }
    else version(LDC)
    {
        // LDC x86: generates blendps since LDC 1.1 -O2
        // arm64: pretty good, two instructions worst case
        return shufflevector!(float4, (imm8 & 1) ? 4 : 0,
                                      (imm8 & 2) ? 5 : 1,
                                      (imm8 & 4) ? 6 : 2,
                                      (imm8 & 8) ? 7 : 3)(a, b);
    }
    else
    {
        __m128 r;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(0, 1, 2, 3);
    __m128 B = _mm_setr_ps(8, 9, 10, 11);
    float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101
    float[4] correct = [8, 1, 10, 11];
    assert(C.array == correct);
}

/// Blend packed 8-bit integers from `a` and `b` using `mask`.
__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: two instructions since LDC 1.12 -O2
        byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7);
        return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a);
    }
    else
    {
        __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
        return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,   7,
                               8,  9, 10, 11, 12, 13, 14,  15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22,  23,
                              24, 25, 26, 27, 28, 29, 30,  31);
    __m128i M = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8, 127,
                               1,  1, -1, -1,  4,  1,  8, -128);
    byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M);
    byte[16] correct = [ 0, 17,  2,  3, 20,  5, 22,  7,
                         8,  9, 26, 27, 12, 13, 14, 31 ];
    assert(R.array == correct);
}
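
// Added example: a sketch of the usual "branchless select" pattern, feeding a
// comparison mask into _mm_blendv_epi8 to compute a per-lane signed maximum.
unittest
{
    __m128i a = _mm_setr_epi8(1, -1, 50, -50, 0,  127, -128, 3, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i b = _mm_setr_epi8(2, -2, 40, -40, 0, -127,  127, 3, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i aGreater = _mm_cmpgt_epi8(a, b);                  // all-ones where a > b
    byte16 m = cast(byte16) _mm_blendv_epi8(b, a, aGreater);  // picks a where a > b, else b
    byte[16] correct = [2, -1, 50, -40, 0, 127, 127, 3, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(m.array == correct);
}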


/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE42)
    {
        // Amazingly enough, GCC/GDC generates the blendvpd instruction
        // with -msse4.2 but not -msse4.1.
        // Not sure what the reason is, and there is a replacement sequence.
        // Sounds like a bug.
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        long2 shift;
        shift = 63;
        long2 lmask = cast(long2)mask >> shift;
        return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
    }
    else
    {
        __m128d r;
        long2 lmask = cast(long2)mask;
        for (int n = 0; n < 2; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    __m128d M1 = _mm_setr_pd(-3.0, 2.0);
    __m128d R1 = _mm_blendv_pd(A, B, M1);
    double[2] correct1 = [3.0, 2.0];
    assert(R1.array == correct1);

    // BUG: LDC _mm_blendv_pd doesn't work with a NaN mask in arm64 Linux for some unknown reason,
    // but it does work in arm64 macOS.
    // It yields different results despite FP seemingly not being used.
    version(linux)
    {}
    else
    {
        __m128d M2 = _mm_setr_pd(double.nan, -double.nan);
        __m128d R2 = _mm_blendv_pd(A, B, M2);
        double[2] correct2 = [1.0, 4.0];
        assert(R2.array == correct2);
    }
}


/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        int4 shift;
        shift = 31;
        int4 lmask = cast(int4)mask >> shift;
        return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
    }
    else
    {
        __m128 r;
        int4 lmask = cast(int4)mask;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128 A  = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B  = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
    __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
    __m128 M2 = _mm_setr_ps(float.nan, -float.nan, -0.0f, +0.0f);
    __m128 R1 = _mm_blendv_ps(A, B, M1);
    __m128 R2 = _mm_blendv_ps(A, B, M2);
    float[4] correct1 = [ 4.0f, 1.0f, 2.0f, 7.0f];
    float[4] correct2 = [ 0.0f, 5.0f, 6.0f, 3.0f];
    assert(R1.array == correct1);

    // BUG: like above, LDC _mm_blendv_ps doesn't work with a NaN mask in arm64 Linux for some unknown reason.
    // It yields different results despite FP seemingly not being used.
    version(linux)
    {}
    else
    {
        assert(R2.array == correct2);
    }
}
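
// Added example: a minimal sketch that builds the blend mask with a comparison,
// here clamping negative lanes to zero with _mm_cmplt_ps + _mm_blendv_ps.
unittest
{
    __m128 x = _mm_setr_ps(-1.5f, 2.0f, -0.25f, 3.0f);
    __m128 zero = _mm_setzero_ps();
    __m128 negative = _mm_cmplt_ps(x, zero);      // sign bit set where x < 0
    __m128 r = _mm_blendv_ps(x, zero, negative);  // select 0.0f where x < 0
    float[4] correct = [0.0f, 2.0f, 0.0f, 3.0f];
    assert(r.array == correct);
}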

/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value,
/// and store the results as packed double-precision floating-point elements.
__m128d _mm_ceil_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        // Unfortunately x86 intrinsics force a round-trip back to double2
        // ARM neon semantics wouldn't have that
        long2 l = vcvtpq_s64_f64(a);
        double2 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        return r;
    }
    else
    {
        return _mm_round_pd!2(a);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3f, -2.12f);
    __m128d B = _mm_setr_pd(53.6f, -2.7f);
    A = _mm_ceil_pd(A);
    B = _mm_ceil_pd(B);
    double[2] correctA = [2.0, -2.0];
    double[2] correctB = [54.0, -2.0];
    assert(A.array == correctA);
    assert(B.array == correctB);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value,
/// and store the results as packed single-precision floating-point elements.
__m128 _mm_ceil_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        int4 l = vcvtpq_s32_f32(a);
        float4 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        r.ptr[2] = l.array[2];
        r.ptr[3] = l.array[3];
        return r;
    }
    else
    {
        return _mm_round_ps!2(a);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 C = _mm_ceil_ps(A);
    float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f];
    assert(C.array == correct);
}

/// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value,
/// store the result as a double-precision floating-point element in the lower element of result,
/// and copy the upper element from `a` to the upper element of dst.
__m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtps_s64_f64(b[0]);
        return a;
    }
    else
    {
        return _mm_round_sd!2(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    __m128d B = _mm_setr_pd(53.6, -3.7);
    __m128d C = _mm_ceil_sd(A, B);
    double[2] correct = [54.0, -2.12];
    assert(C.array == correct);
}

/// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
/// store the result as a single-precision floating-point element in the lower element of result,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtps_s32_f32(b[0]);
        return a;
    }
    else
    {
        return _mm_round_ss!2(a, b);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
    __m128 C = _mm_ceil_ss(A, B);
    float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f];
    assert(C.array == correct);
}

/// Compare packed 64-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
    }
    else version(LDC)
    {
        // LDC x86: generates pcmpeqq since LDC 1.1 -O1
        // arm64: generates cmeq since LDC 1.8 -O1
        return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
    }
    else
    {
        // Clever pcmpeqd + pand use with LDC 1.24 -O2
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 res;
        res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
        res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
        return cast(__m128i)res;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, -2);
    __m128i B = _mm_setr_epi64(-3, -2);
    __m128i C = _mm_setr_epi64(-1, -4);
    long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
    long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
    long[2] correct1 = [0, -1];
    long[2] correct2 = [-1, 0];
    assert(AB.array == correct1);
    assert(AC.array == correct2);
}


/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovsxwd128(cast(short8)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = sext <4 x i16> %v to <4 x i32>
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi16_epi32(A);
    int[4] correct = [-1, 0, -32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovsxwq128(cast(short8)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i16> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi16_epi64(A);
    long[2] correct = [-32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovsxdq128(cast(int4)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i32> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
    }
    else
    {
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepi32_epi64(A);
    long[2] correct = [-4, 42];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepi8_epi16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovsxbw128(cast(ubyte16)a);
    }
    else version(LDC)
    {
        // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0
        // LDC ARM64: sshll generated since LDC 1.8.0 -O1
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
            %r = sext <8 x i8> %v to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        short8 r;
        foreach(n; 0..8)
            r.ptr[n] = sa.array[n];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepi8_epi16(A);
    short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovsxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE41)
    {
        // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            %r = sext <4 x i8> %v to <4 x i32>
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
    }
    else
    {
        // LDC ARM64: this gives the same codegen as a vmovl_s16/vmovl_s8 sequence would
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi8_epi32(A);
    int[4] correct = [127, -128, 1, -1];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovsxbq128(cast(ubyte16)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxbq since LDC 1.1 -O0,
        // LDC arm64: it's ok since LDC 1.8 -O1
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i8> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        long2 r;
        foreach(n; 0..2)
            r.ptr[n] = sa.array[n];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi8_epi64(A);
    long[2] correct = [127, -128];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1, also good without SSE4.1
        // arm64: ushll since LDC 1.12 -O1
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu16_epi32(A);
    int[4] correct = [65535, 0, 32768, 32767];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit shorter than below, in -O2
        short8 sa = cast(short8)a;
        long2 r;
        for (int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ushort)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1, also good without SSE4.1
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu16_epi64(A);
    long[2] correct = [65535, 0];
    assert(C.array == correct);
}
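
// Added example: sign-extension vs zero-extension of the same source lanes, to make
// the difference between the _mm_cvtepi* and _mm_cvtepu* families explicit.
unittest
{
    __m128i A = _mm_setr_epi16(-1, -32768, 0, 0, 0, 0, 0, 0);
    int4 signExt = cast(int4) _mm_cvtepi16_epi32(A);
    int4 zeroExt = cast(int4) _mm_cvtepu16_epi32(A);
    assert(signExt.array[0] == -1     && zeroExt.array[0] == 65535);
    assert(signExt.array[1] == -32768 && zeroExt.array[1] == 32768);
}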


/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pmovzxdq since LDC 1.12 -O1, also good without SSE4.1
        // arm64: generates ushll since LDC 1.12 -O1
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = cast(uint)sa.array[0];
        r.ptr[1] = cast(uint)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepu32_epi64(A);
    long[2] correct = [4294967295, 42];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepu8_epi16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
    }
    else
    {
        // LDC x86: generates pmovzxbw since LDC 1.12 -O1, also good without SSE4.1
        // arm64: ushll since LDC 1.12 -O1
        // PERF: catastrophic with GDC without SSE4.1
        byte16 sa = cast(byte16)a;
        short8 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        r.ptr[2] = cast(ubyte)sa.array[2];
        r.ptr[3] = cast(ubyte)sa.array[3];
        r.ptr[4] = cast(ubyte)sa.array[4];
        r.ptr[5] = cast(ubyte)sa.array[5];
        r.ptr[6] = cast(ubyte)sa.array[6];
        r.ptr[7] = cast(ubyte)sa.array[7];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepu8_epi16(A);
    short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit better than below in -O2
        byte16 sa = cast(byte16)a;
        int4 r;
        for (int n = 0; n < 4; ++n)
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxbd since LDC 1.12 -O1, also good without SSE4.1
        // PERF: catastrophic with GDC without SSE4.1
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        r.ptr[2] = cast(ubyte)sa.array[2];
        r.ptr[3] = cast(ubyte)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu8_epi32(A);
    int[4] correct = [127, 128, 1, 255];
    assert(C.array == correct);
}

/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbq128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: this optimizes better than the loop below
        byte16 sa = cast(byte16)a;
        long2 r;
        for (int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
        byte16 sa = cast(byte16)a;
        long2 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu8_epi64(A);
    long[2] correct = [127, 254];
    assert(C.array == correct);
}

/// Conditionally multiply the packed double-precision (64-bit) floating-point elements
/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, and conditionally
/// store the sum in dst using the low 4 bits of `imm8`.
__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else
    {
        __m128d zero = _mm_setzero_pd();
        __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b);
        double sum = temp.array[0] + temp.array[1];
        return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum));
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 8.0);
    double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B);
    double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B);
    double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B);
    double[2] correct1 = [ 4.0,  4.0];
    double[2] correct2 = [16.0,  0.0];
    double[2] correct3 = [ 0.0, 20.0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}
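
// Added example: the typical "full mask" use of _mm_dp_pd, a minimal sketch of a
// 2-element dot product broadcast to both lanes (imm8 = 0x33).
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 8.0);
    double2 R = _mm_dp_pd!0x33(A, B); // both products summed, sum written to both lanes
    double[2] correct = [20.0, 20.0];
    assert(R.array == correct);
}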

/// Conditionally multiply the packed single-precision (32-bit) floating-point elements
/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products,
/// and conditionally store the sum in result using the low 4 bits of `imm8`.
__m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(byte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(byte)imm8);
    }
    else
    {
        __m128 zero = _mm_setzero_ps();
        __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b);
        float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
        return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum));
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
    __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
    float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B);
    float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B);
    float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B);
    float[4] correct1 = [67.0f, 67.0f, 67.0f, 67.0f];
    float[4] correct2 = [23.0f,  0.0f, 23.0f,  0.0f];
    float[4] correct3 = [ 0.0f, 29.0f,  0.0f, 29.0f];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}


/// Extract a 32-bit integer from `a`, selected with `imm8`.
int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
{
    return (cast(int4)a).array[imm8 & 3];
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    assert(_mm_extract_epi32(A, 0) == 1);
    assert(_mm_extract_epi32(A, 1 + 8) == 2);
    assert(_mm_extract_epi32(A, 3 + 4) == 4);
}

/// Extract a 64-bit integer from `a`, selected with `imm8`.
long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
{
    long2 la = cast(long2)a;
    return la.array[imm8 & 1];
}
unittest
{
    __m128i A = _mm_setr_epi64(45, -67);
    assert(_mm_extract_epi64(A, 0) == 45);
    assert(_mm_extract_epi64(A, 1) == -67);
    assert(_mm_extract_epi64(A, 2) == 45);
}

/// Extract an 8-bit integer from `a`, selected with `imm8`.
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
{
    byte16 ba = cast(byte16)a;
    return cast(ubyte) ba.array[imm8 & 15];
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
    assert(_mm_extract_epi8(A, 7) == 7);
    assert(_mm_extract_epi8(A, 13) == 255);
    assert(_mm_extract_epi8(A, 7 + 16) == 7);
}

/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
/// Note: returns a 32-bit $(I integer).
int _mm_extract_ps (__m128 a, const int imm8) @trusted
{
    return (cast(int4)a).array[imm8 & 3];
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
    assert(_mm_extract_ps(A, 0) == 0x3f800000);
    assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
    assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
}
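
// Added example: recovering the actual float value from _mm_extract_ps' integer result,
// a minimal sketch using a pointer-based bit reinterpretation.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
    int bits = _mm_extract_ps(A, 2);  // raw IEEE-754 bits of element 2
    float f = *cast(float*)&bits;     // reinterpret the bits as a float
    assert(f == 3.0f);
}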



/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an
/// integer value, and store the results as packed double-precision floating-point elements.
__m128d _mm_floor_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        long2 l = vcvtmq_s64_f64(a);
        double2 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        return r;
    }
    else
    {
        return _mm_round_pd!1(a);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3f, -2.12f);
    __m128d B = _mm_setr_pd(53.6f, -2.7f);
    A = _mm_floor_pd(A);
    B = _mm_floor_pd(B);
    double[2] correctA = [1.0, -3.0];
    double[2] correctB = [53.0, -3.0];
    assert(A.array == correctA);
    assert(B.array == correctB);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an
/// integer value, and store the results as packed single-precision floating-point elements.
__m128 _mm_floor_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        int4 l = vcvtmq_s32_f32(a);
        float4 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        r.ptr[2] = l.array[2];
        r.ptr[3] = l.array[3];
        return r;
    }
    else
    {
        return _mm_round_ps!1(a);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 C = _mm_floor_ps(A);
    float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
    assert(C.array == correct);
}

/// Round the lower double-precision (64-bit) floating-point element in `b` down to an
/// integer value, store the result as a double-precision floating-point element in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtms_s64_f64(b[0]);
        return a;
    }
    else
    {
        return _mm_round_sd!1(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    __m128d B = _mm_setr_pd(-53.1, -3.7);
    __m128d C = _mm_floor_sd(A, B);
    double[2] correct = [-54.0, -2.12];
    assert(C.array == correct);
}

/// Round the lower single-precision (32-bit) floating-point element in `b` down to an
/// integer value, store the result as a single-precision floating-point element in the
/// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
__m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtms_s32_f32(b[0]);
        return a;
    }
    else
    {
        return _mm_round_ss!1(a, b);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
    __m128 C = _mm_floor_ss(A, B);
    float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f];
    assert(C.array == correct);
}
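
// Added example: the named _MM_FROUND_FLOOR / _MM_FROUND_CEIL constants used directly
// with _mm_round_pd, matching the dedicated _mm_floor_pd / _mm_ceil_pd intrinsics.
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    double2 f = _mm_round_pd!_MM_FROUND_FLOOR(A);
    double2 c = _mm_round_pd!_MM_FROUND_CEIL(A);
    double[2] correctF = [1.0, -3.0];
    double[2] correctC = [2.0, -2.0];
    assert(f.array == correctF);
    assert(c.array == correctC);
}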

/// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
    // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
    // LDC arm64: ins.s since LDC 1.8 -O2
    int4 ia = cast(int4)a;
    ia.ptr[imm8 & 3] = i;
    return cast(__m128i)ia;
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4);
    int[4] result = [1, 2, 5, 4];
    assert(C.array == result);
}

/// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
    // LDC x86: always do something sensible.
    long2 la = cast(long2)a;
    la.ptr[imm8 & 1] = i;
    return cast(__m128i)la;
}
unittest
{
    __m128i A = _mm_setr_epi64(1, 2);
    long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
    long[2] result = [1, 5];
    assert(C.array == result);
}

/// Insert the 8-bit integer `i` into `a` at the location specified by `imm8[2:0]`.
/// Copy `a` to dst, and insert the lower 8-bit integer from `i` into dst at the location specified by `imm8`.
__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
{
    // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
    // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
    byte16 ba = cast(byte16)a;
    ba.ptr[imm8 & 15] = cast(byte)i;
    return cast(__m128i)ba;
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
    byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    assert(C.array == result);
}


/// Warning: of course it does something totally different from `_mm_insert_epi32`!
/// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b`
/// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]`
/// (elements are zeroed out when the corresponding bit is set).
__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
    }
    else
    {
        float4 tmp2 = a;
        float tmp1 = b.array[(imm8 >> 6) & 3];
        tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
        return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
    float[4] correct = [1.0f, 2.0f, 0.0f, 7.0f];
    assert(C.array == correct);
}
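
// Added example: building the _mm_insert_ps control byte from its three fields
// (imm8[7:6] = source lane in `b`, imm8[5:4] = destination lane, imm8[3:0] = zero mask).
// The field values below are just an illustrative choice.
unittest
{
    enum int src = 1, dst = 0, zmask = 0b1000;
    enum int imm8 = (src << 6) | (dst << 4) | zmask;
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 C = _mm_insert_ps!imm8(A, B);
    float[4] correct = [6.0f, 2.0f, 3.0f, 0.0f]; // B[1] placed in lane 0, lane 3 zeroed
    assert(C.array == correct);
}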


/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxsd since LDC 1.1 -O1
        // ARM: smax.4s since LDC 1.8 -O1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        int4 greater = greaterMask!int4(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i higher = _mm_cmpgt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
                                      _mm_setr_epi32(-4, -8, 9, -8));
    int[4] correct = [0x7fffffff, 1, 9, 7];
    assert(R.array == correct);
}

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed maximum values.
__m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pmaxsb since LDC 1.1 -O1
        // ARM64: smax.16b since LDC 1.8.0 -O1
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b otherwise
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_max_epi8(A, B);
    byte[16] correct = [127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pmaxuw since LDC 1.1 -O1
        // ARM64: umax.8h since LDC 1.8.0 -O1
        // PERF: without sse4.1, LLVM 12 produces a very interesting
        //       psubusw xmm0, xmm1
        //       paddw   xmm0, xmm1
        //       sequence that maybe should go in other min/max intrinsics?
        ushort8 sa = cast(ushort8)a;
        ushort8 sb = cast(ushort8)b;
        ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        b = _mm_subs_epu16(b, a);
        b = _mm_add_epi16(b, a);
        return b;
    }
}
unittest
{
    short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57),
                                          _mm_setr_epi16(-4, -8, 9, -7, 0, -32768, 0, 0));
    short[8] correct = [-4, -8, -4, -7, 9, -32768, 0, 57];
    assert(R.array == correct);
}

/// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umax.4s since LDC 1.8.0 -O1
        uint4 sa = cast(uint4)a;
        uint4 sb = cast(uint4)b;
        uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i valueShift = _mm_set1_epi32(-0x80000000);
        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7),
                                      _mm_setr_epi32(-4, -8, 9, -8));
    int[4] correct = [-4, -8, 9, -7];
    assert(R.array == correct);
}

/// Compare packed signed 32-bit integers in `a` and `b`, returns packed minimum values.
__m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
        // ARM: smin.4s since LDC 1.8 -O1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        int4 greater = greaterMask!int4(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i higher = _mm_cmplt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
                                      _mm_setr_epi32(-4, -8, 9, -8));
    int[4] correct = [-4, -8, -4, -8];
    assert(R.array == correct);
}

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed minimum values.
__m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pminsb since LDC 1.1 -O1
        // ARM64: smin.16b since LDC 1.8.0 -O1
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b otherwise
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_min_epi8(A, B);
    byte[16] correct = [4, -8, -4, -8, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
        // ARM64: umin.8h since LDC 1.8.0 -O1
        ushort8 sa = cast(ushort8)a;
        ushort8 sb = cast(ushort8)b;
        ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i c = _mm_subs_epu16(b, a);
        b = _mm_sub_epi16(b, c);
        return b;
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57),
                                          _mm_setr_epi16(-4, -8, 9, -7, 0, -32768, 0, 0));
    short[8] correct = [32767, 1, 9, -8, 0, 7, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umin.4s since LDC 1.8.0 -O1
        uint4 sa = cast(uint4)a;
        uint4 sb = cast(uint4)b;
        uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i valueShift = _mm_set1_epi32(-0x80000000);
        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift));
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7),
                                      _mm_setr_epi32(-4, -8, 9, -8));
    int[4] correct = [0x7fffffff, 1, 4, -8];
    assert(R.array == correct);
}

/// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`,
/// store the minimum and index in return value, and zero the remaining bits.
__m128i _mm_minpos_epu16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
        __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
        __m128i best = _mm_min_epu32(combinedLo, combinedHi);
        best = _mm_min_epu32(best, _mm_srli_si128!8(best));
        best = _mm_min_epu32(best, _mm_srli_si128!4(best));
        short8 sbest = cast(short8)best;
        short8 r;
        r[0] = sbest[1];
        r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie
        r[2] = 0;
        r[3] = 0;
        r[4] = 0;
        r[5] = 0;
        r[6] = 0;
        r[7] = 0;
        return cast(__m128i)r;
    }
    else
    {
        short8 sa = cast(short8)a;
        ushort min = 0xffff;
        int index = 0;
        for (int n = 0; n < 8; ++n)
        {
            ushort c = sa.array[n];
            if (c < min)
            {
                min = c;
                index = n;
            }
        }
        short8 r;
        r.ptr[0] = min;
        r.ptr[1] = cast(short)index;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
    __m128i B = _mm_setr_epi16(14, 4, 4, 2, -3, 2, 5, 6);
    short8 R1 = cast(short8) _mm_minpos_epu16(A);
    short8 R2 = cast(short8) _mm_minpos_epu16(B);
    short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
    short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
}
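
// Added example: unpacking the value/index pair produced by _mm_minpos_epu16,
// a minimal sketch using the SSE2 _mm_extract_epi16 intrinsic.
unittest
{
    __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
    __m128i R = _mm_minpos_epu16(A);
    int minValue = _mm_extract_epi16(R, 0); // zero-extended 16-bit minimum
    int minIndex = _mm_extract_epi16(R, 1); // index of that minimum
    assert(minValue == 1);
    assert(minIndex == 2);
}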

/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers
/// in `a` compared to those in `b`, and store the 16-bit results in dst.
/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`.
/// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`.
/// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting
/// at the offset specified in `imm8[2]`.
__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
    }
    else
    {
        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high-order quadruplets are unaddressable...
        int b_offset = (imm8 & 3) * 4;

        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        short8 r;

        __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);

        for (int j = 0; j < 8; j += 2)
        {
            int k = a_offset + j;
            __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
                                           0, 0, 0, 0,
                                           ba[k+1], ba[k+2], ba[k+3], ba[k+4],
                                           0, 0, 0, 0);
            short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64
            r.ptr[j]   = diffs.array[0];
            r.ptr[j+1] = diffs.array[4];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5, 5, 5, 12, 13, 14, 15);
    short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
    short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
    short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
    short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
    short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
    short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
    short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
    short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
    short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
    short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
    assert(r1.array == correct1);
    assert(r4.array == correct4);
    assert(r5.array == correct5);
    assert(r7.array == correct7);
    assert(r8.array == correct0);
}

/// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`, and store the signed 64-bit results in dst.
__m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSE41)
    {
        // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
        // Use IR instead.
        // This generates pmuldq since LDC 1.2.0 -O0
        enum ir = `
            %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
            %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
            %la = sext <2 x i32> %ia to <2 x i64>
            %lb = sext <2 x i32> %ib to <2 x i64>
            %r = mul <2 x i64> %la, %lb
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // 3 instructions since LDC 1.8 -O2
        // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull
        int2 a_lo = vmovn_s64(cast(long2)a);
        int2 b_lo = vmovn_s64(cast(long2)b);
        return cast(__m128i) vmull_s32(a_lo, b_lo);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        long2 r;
        r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
        r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    long2 R = cast(long2) _mm_mul_epi32(A, B);
    long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
    assert(R.array == correct);
}

/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers,
/// return the low 32 bits of the intermediate integers.
__m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    // PERF GDC without SSE4.1 could be better
    static if (GDC_with_SSE41)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        // Note: older GDC doesn't have that op, but older GDC
        // also has no support for -msse4.1 detection
        return cast(__m128i)(ia * ib);
    }
    else version(LDC)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        return cast(__m128i)(ia * ib);
    }
    else
    {
        // DMD doesn't take the above
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] * ib.array[0];
        r.ptr[1] = ia.array[1] * ib.array[1];
        r.ptr[2] = ia.array[2] * ib.array[2];
        r.ptr[3] = ia.array[3] * ib.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    int4 R = cast(int4) _mm_mullo_epi32(A, B);
    int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
    assert(R.array == correct);
}
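
// Added example: contrasting _mm_mullo_epi32 (low 32 bits of every lane's product)
// with _mm_mul_epi32 (full 64-bit products of the even-indexed lanes only).
unittest
{
    __m128i A = _mm_setr_epi32(0x10000, 3, 0x10000, 5);
    __m128i B = _mm_setr_epi32(0x10000, 7, 2, 11);
    int4 lo = cast(int4) _mm_mullo_epi32(A, B);
    long2 full = cast(long2) _mm_mul_epi32(A, B);
    int[4] correctLo = [0, 21, 0x20000, 55];         // 0x10000 * 0x10000 wraps to 0
    long[2] correctFull = [0x100000000L, 0x20000];   // lanes 0 and 2, full precision
    assert(lo.array == correctLo);
    assert(full.array == correctFull);
}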


/// Convert packed signed 32-bit integers from `a` and `b`
/// to packed 16-bit integers using unsigned saturation.
__m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE41)
    {
        // PERF For some reason this doesn't generate the builtin???
        return cast(__m128i) __builtin_ia32_packusdw128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_packusdw128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        int4 z;
        z = 0;
        return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
                                          vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
    }
    else
    {
        // PERF: not great without SSE4.1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        ushort[8] result;
        for (int i = 0; i < 4; ++i)
        {
            int s = sa.array[i];
            if (s < 0) s = 0;
            if (s > 65535) s = 65535;
            result.ptr[i] = cast(ushort)s;

            s = sb.array[i];
            if (s < 0) s = 0;
            if (s > 65535) s = 65535;
            result.ptr[i+4] = cast(ushort)s;
        }
        return cast(__m128i) loadUnaligned!(short8)(cast(short*)result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packus_epi32(A, A);
    short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
    assert(R.array == correct);
}


/// Round the packed double-precision (64-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed double-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_pd(int rounding)(__m128d a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);
            return _mm_setr_pd(lo, hi);
        }
        else
        {
            version(GNU) pragma(inline, false); // else fail unittest with optimizations

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);

            // Convert back to double to achieve the rounding.
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return _mm_setr_pd(lo, hi);
        }
    }
}
unittest
{
    // tested in other intrinsics
}
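
// Added example: a minimal check of the four explicit rounding modes on ties,
// using values small enough to avoid the large-magnitude limitation noted above.
unittest
{
    __m128d A = _mm_setr_pd(2.5, -2.5);
    double2 nearest = _mm_round_pd!(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)(A);
    double2 down    = _mm_round_pd!(_MM_FROUND_TO_NEG_INF     | _MM_FROUND_NO_EXC)(A);
    double2 up      = _mm_round_pd!(_MM_FROUND_TO_POS_INF     | _MM_FROUND_NO_EXC)(A);
    double2 trunc   = _mm_round_pd!(_MM_FROUND_TO_ZERO        | _MM_FROUND_NO_EXC)(A);
    double[2] correctNearest = [2.0, -2.0]; // ties round to even
    double[2] correctDown    = [2.0, -3.0];
    double[2] correctUp      = [3.0, -2.0];
    double[2] correctTrunc   = [2.0, -2.0];
    assert(nearest.array == correctNearest);
    assert(down.array == correctDown);
    assert(up.array == correctUp);
    assert(trunc.array == correctTrunc);
}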

/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ps(int rounding)(__m128 a) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            __m128i integers = _mm_cvtps_epi32(a);
            return _mm_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get reordered
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m128i integers = _mm_cvtps_epi32(a);

            // Convert back to float to achieve the rounding.
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            __m128 result = _mm_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
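
// Illustrative check: with (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) each lane
// is rounded to the nearest integer. A minimal sketch using small, exactly
// representable values with no round-to-even ties.
unittest
{
    __m128 A = _mm_setr_ps(1.25f, -1.25f, 2.75f, -2.75f);
    __m128 R = _mm_round_ps!(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)(A);
    float[4] correct = [1.0f, -1.0f, 3.0f, -3.0f];
    assert(R.array == correct);
}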

/// Round the lower double-precision (64-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a double-precision floating-point element
/// in the lower element of result, and copy the upper element from `a` to the upper element of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;
            return a;
        }
        else
        {
            version(GNU) pragma(inline, false); // else fail unittest with optimizations

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;

            // Convert back to double to achieve the rounding.
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}


/// Round the lower single-precision (32-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a single-precision floating-point element
/// in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;
            return a;
        }
        else
        {
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 32-bit integer
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;

            // Convert back to float to achieve the rounding.
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
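
// Illustrative checks of the scalar variants above: the lower element of `b` is
// rounded while the upper element(s) come from `a`. A minimal sketch with values
// that are exact in both float and double.
unittest
{
    // _mm_round_sd: truncate b[0], keep a[1]
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(-2.75, 100.0);
    __m128d R = _mm_round_sd!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(A, B);
    double[2] correctD = [-2.0, 2.0];
    assert(R.array == correctD);

    // _mm_round_ss: round b[0] to nearest, keep a[1..3]
    __m128 P = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 Q = _mm_setr_ps(2.25f, 0.0f, 0.0f, 0.0f);
    __m128 S = _mm_round_ss!(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)(P, Q);
    float[4] correctF = [2.0f, 2.0f, 3.0f, 4.0f];
    assert(S.array == correctF);
}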

/// Load 128-bits of integer data from memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
__m128i _mm_stream_load_si128 (__m128i * mem_addr) @trusted
{
    // BUG: see `_mm_stream_ps` for an explanation of why we don't implement non-temporal moves.
    return *mem_addr; // it's a regular move instead
}


/// Return 1 if all bits in `a` are 1's. Else return 0.
int _mm_test_all_ones (__m128i a) @safe
{
    return _mm_testc_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(-1);
    __m128i B = _mm_set_epi32(-1, -2, -1, -1);
    assert(_mm_test_all_ones(A) == 1);
    assert(_mm_test_all_ones(B) == 0);
}

/// Return 1 if all bits in `a` are 0's. Else return 0.
// This is a #BONUS since it was lacking in the Intel Intrinsics API.
int _mm_test_all_zeros (__m128i a) @safe
{
    return _mm_testz_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(0);
    __m128i B = _mm_set_epi32(0, 8, 0, 0);
    assert(_mm_test_all_zeros(A) == 1);
    assert(_mm_test_all_zeros(B) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
/// and return 1 if the result is zero, otherwise return 0.
int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
{
    return _mm_testz_si128(a, mask); // it's really the same, but with a clearer name
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `mask`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
{
    return _mm_testnzc_si128(a, mask);
}
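
// Illustrative checks of the two helpers above, which have no unittest of their own:
// `_mm_test_all_zeros(a, mask)` is 1 iff no bit selected by `mask` is set in `a`, and
// `_mm_test_mix_ones_zeros(a, mask)` is 1 iff `mask` selects both set and cleared bits
// of `a`. A minimal sketch based on the ZF/CF description above.
unittest
{
    __m128i M = _mm_setr_epi32(0x0f, 0x0f, 0, 0);
    __m128i A = _mm_setr_epi32(0x00, 0xf0, 0, 0); // no masked bit set
    __m128i B = _mm_setr_epi32(0x0f, 0xf0, 0, 0); // some masked bits set, some cleared
    assert(_mm_test_all_zeros(A, M) == 1);
    assert(_mm_test_all_zeros(B, M) == 0);
    assert(_mm_test_mix_ones_zeros(A, M) == 0);
    assert(_mm_test_mix_ones_zeros(B, M) == 1);
}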

/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the
/// result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 1 in `a`.
int _mm_testc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testc_si128(A, A) == 1);
    assert(_mm_testc_si128(A, M1) == 0);
    assert(_mm_testc_si128(A, M2) == 1);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
        long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);

        return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
                | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
    }
    else
    {
        __m128i c = a & b;
        __m128i d = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return !( (c.array == zero) || (d.array == zero) );
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
    __m128i Z = _mm_setzero_si128();
    assert(_mm_testnzc_si128(A, Z) == 0);
    assert(_mm_testnzc_si128(A, M) == 1);
    assert(_mm_testnzc_si128(A, A) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm_testz_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testz_si128(A, A) == 0);
    assert(_mm_testz_si128(A, M1) == 1);
    assert(_mm_testz_si128(A, M2) == 0);
}