/**
* SSE4.1 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.smmintrin;

// SSE4.1 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
// Note: this header will work whether you have SSE4.1 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
// generate SSE4.1 instructions.
// With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions.

public import inteli.types;
import inteli.internals;

// smmintrin pulls in all previous instruction set intrinsics.
public import inteli.tmmintrin;

nothrow @nogc:

enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
enum int _MM_FROUND_TO_NEG_INF     = 0x01; /// ditto
enum int _MM_FROUND_TO_POS_INF     = 0x02; /// ditto
enum int _MM_FROUND_TO_ZERO        = 0x03; /// ditto
enum int _MM_FROUND_CUR_DIRECTION  = 0x04; /// ditto
enum int _MM_FROUND_RAISE_EXC      = 0x00; /// ditto
enum int _MM_FROUND_NO_EXC         = 0x08; /// ditto

enum int _MM_FROUND_NINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT);
enum int _MM_FROUND_FLOOR     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
enum int _MM_FROUND_CEIL      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
enum int _MM_FROUND_TRUNC     = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
enum int _MM_FROUND_RINT      = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC    | _MM_FROUND_CUR_DIRECTION);

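// Minimal usage sketch: the composite _MM_FROUND_* constants above are meant to be
// passed as the compile-time `rounding` argument of the `_mm_round_*` intrinsics
// defined later in this module.
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.7f, 2.5f, -1.2f);
    __m128 F = _mm_round_ps!_MM_FROUND_FLOOR(A); // round toward -infinity
    __m128 T = _mm_round_ps!_MM_FROUND_TRUNC(A); // round toward zero
    float[4] correctF = [1.0f, -3.0f, 2.0f, -2.0f];
    float[4] correctT = [1.0f, -2.0f, 2.0f, -1.0f];
    assert(F.array == correctF);
    assert(T.array == correctT);
}
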
/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
    }
    else
    {
        // LDC x86: generates pblendw since LDC 1.1 and -O2
        short8 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        for (int n = 0; n < 8; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
    short[8] correct = [8, 9, 2, 3, 12, 5, 6, 15];
    assert(C.array == correct);
}


/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
// Note: changed signature, GDC needs a compile-time value for `imm8`.
__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    static assert(imm8 >= 0 && imm8 < 4);
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
    }
    else
    {
        // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
        double2 r;
        for (int n = 0; n < 2; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return cast(__m128d)r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0, 1);
    __m128d B = _mm_setr_pd(8, 9);
    double2 C = _mm_blend_pd!2(A, B);
    double[2] correct = [0, 9];
    assert(C.array == correct);
}


/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control mask `imm8`.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static assert(imm8 >= 0 && imm8 < 16);
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendps(a, b, imm8);
    }
    else version(LDC)
    {
        // LDC x86: generates blendps since LDC 1.1 -O2
        // arm64: pretty good, two instructions worst case
        return shufflevector!(float4, (imm8 & 1) ? 4 : 0,
                                      (imm8 & 2) ? 5 : 1,
                                      (imm8 & 4) ? 6 : 2,
                                      (imm8 & 8) ? 7 : 3)(a, b);
    }
    else
    {
        __m128 r; // PERF =void;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(0, 1, 2, 3);
    __m128 B = _mm_setr_ps(8, 9, 10, 11);
    float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101
    float[4] correct = [8, 1, 10, 11];
    assert(C.array == correct);
}

/// Blend packed 8-bit integers from `a` and `b` using `mask`.
__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
{
    // PERF DMD
    // TODO BUG GDC version
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: two instructions since LDC 1.12 -O2
        byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7);
        return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a);
    }
    else
    {
        __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
        return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,   7,
                               8,  9, 10, 11, 12, 13, 14,  15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22,  23,
                              24, 25, 26, 27, 28, 29, 30,  31);
    __m128i M = _mm_setr_epi8( 1, -1,  1,  1, -4,  1, -8, 127,
                               1,  1, -1, -1,  4,  1,  8, -128);
    byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M);
    byte[16] correct =      [  0, 17,  2,  3, 20,  5, 22,   7,
                               8,  9, 26, 27, 12, 13, 14,  31 ];
    assert(R.array == correct);
}

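// Minimal usage sketch: the typical compare-then-blend idiom with `_mm_blendv_epi8`,
// here computing a per-byte signed maximum. Only the top bit of each mask byte matters.
unittest
{
    __m128i a = _mm_setr_epi8(1, -5, 3, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i b = _mm_setr_epi8(2, -9, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i mask = _mm_cmpgt_epi8(b, a);                  // 0xFF where b > a
    byte16 r = cast(byte16) _mm_blendv_epi8(a, b, mask);  // pick b where b > a
    assert(r.array[0] == 2 && r.array[1] == -5 && r.array[3] == 7);
}
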

/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE42)
    {
        // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction
        // with -msse4.2 but not -msse4.1.
        // Not sure what the reason is, and there is a replacement sequence.
        // Sounds like a bug.
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_blendvpd(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        long2 shift;
        shift = 63;
        long2 lmask = cast(long2)mask >> shift;
        return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
    }
    else
    {
        __m128d r; // PERF =void;
        long2 lmask = cast(long2)mask;
        for (int n = 0; n < 2; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(3.0, 4.0);
    __m128d M1 = _mm_setr_pd(-3.0, 2.0);
    __m128d R1 = _mm_blendv_pd(A, B, M1);
    double[2] correct1 = [3.0, 2.0];
    assert(R1.array == correct1);

    // BUG: LDC _mm_blendv_pd doesn't work with a NaN mask on arm64 Linux for some unknown reason,
    // but it does work on arm64 macOS.
    // It yields different results despite FP seemingly not being used.
    version(linux)
    {}
    else
    {
        __m128d M2 = _mm_setr_pd(double.nan, -double.nan);
        __m128d R2 = _mm_blendv_pd(A, B, M2);
        double[2] correct2 = [1.0, 4.0];
        assert(R2.array == correct2);
    }
}


/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_blendvps(a, b, mask);
    }
    else static if (LDC_with_ARM64)
    {
        int4 shift;
        shift = 31;
        int4 lmask = cast(int4)mask >> shift;
        return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
    }
    else
    {
        __m128 r; // PERF =void;
        int4 lmask = cast(int4)mask;
        for (int n = 0; n < 4; ++n)
        {
            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
        }
        return r;
    }
}
unittest
{
    __m128 A  = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B  = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
    __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
    __m128 M2 = _mm_setr_ps(float.nan, -float.nan, -0.0f, +0.0f);
    __m128 R1 = _mm_blendv_ps(A, B, M1);
    __m128 R2 = _mm_blendv_ps(A, B, M2);
    float[4] correct1 = [ 4.0f, 1.0f, 2.0f, 7.0f];
    float[4] correct2 = [ 0.0f, 5.0f, 6.0f, 3.0f];
    assert(R1.array == correct1);

    // BUG: like above, LDC _mm_blendv_ps doesn't work with a NaN mask on arm64 Linux for some unknown reason.
    // It yields different results despite FP seemingly not being used.
    version(linux)
    {}
    else
    {
        assert(R2.array == correct2);
    }
}

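// Minimal usage sketch: `_mm_blendv_ps` only looks at the sign bit of each mask lane,
// so any float with its sign bit set (here the negative lanes of `a` itself) selects `b`.
unittest
{
    __m128 a = _mm_setr_ps( 1.0f, -2.0f,  3.0f, -4.0f);
    __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    __m128 r = _mm_blendv_ps(a, b, a); // negative lanes of `a` select from `b`
    float[4] correct = [1.0f, 20.0f, 3.0f, 40.0f];
    assert(r.array == correct);
}
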
/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value,
/// and store the results as packed double-precision floating-point elements.
__m128d _mm_ceil_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        // Unfortunately x86 intrinsics force a round-trip back to double2
        // ARM neon semantics wouldn't have that
        long2 l = vcvtpq_s64_f64(a);
        double2 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        return r;
    }
    else
    {
        return _mm_round_pd!2(a);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3f, -2.12f);
    __m128d B = _mm_setr_pd(53.6f, -2.7f);
    A = _mm_ceil_pd(A);
    B = _mm_ceil_pd(B);
    double[2] correctA = [2.0, -2.0];
    double[2] correctB = [54.0, -2.0];
    assert(A.array == correctA);
    assert(B.array == correctB);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value,
/// and store the results as packed single-precision floating-point elements.
__m128 _mm_ceil_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        int4 l = vcvtpq_s32_f32(a);
        float4 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        r.ptr[2] = l.array[2];
        r.ptr[3] = l.array[3];
        return r;
    }
    else
    {
        return _mm_round_ps!2(a);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 C = _mm_ceil_ps(A);
    float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f];
    assert(C.array == correct);
}

/// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value,
/// store the result as a double-precision floating-point element in the lower element of result,
/// and copy the upper element from `a` to the upper element of dst.
__m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtps_s64_f64(b[0]);
        return a;
    }
    else
    {
        return _mm_round_sd!2(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    __m128d B = _mm_setr_pd(53.6, -3.7);
    __m128d C = _mm_ceil_sd(A, B);
    double[2] correct = [54.0, -2.12];
    assert(C.array == correct);
}

/// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
/// store the result as a single-precision floating-point element in the lower element of result,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtps_s32_f32(b[0]);
        return a;
    }
    else
    {
        return _mm_round_ss!2(a, b);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
    __m128 C = _mm_ceil_ss(A, B);
    float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f];
    assert(C.array == correct);
}

/// Compare packed 64-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
    }
    else version(LDC)
    {
        // LDC x86: generates pcmpeqq since LDC 1.1 -O1
        //   arm64: generates cmeq since LDC 1.8 -O1
        return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
    }
    else
    {
        // Clever pcmpeqd + pand use with LDC 1.24 -O2
        long2 la = cast(long2)a;
        long2 lb = cast(long2)b;
        long2 res;
        res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
        res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
        return cast(__m128i)res;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, -2);
    __m128i B = _mm_setr_epi64(-3, -2);
    __m128i C = _mm_setr_epi64(-1, -4);
    long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
    long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
    long[2] correct1 = [0, -1];
    long[2] correct2 = [-1, 0];
    assert(AB.array == correct1);
    assert(AC.array == correct2);
}


/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
            %r = sext <4 x i16> %v to <4 x i32>
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi16_epi32(A);
    int[4] correct = [-1, 0, -32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i16> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi16_epi64(A);
    long[2] correct = [-32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i32> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
    }
    else
    {
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepi32_epi64(A);
    long[2] correct = [-4, 42];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepi8_epi16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
    }
    else version(LDC)
    {
        // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0
        // LDC ARM64: sshll generated since LDC 1.8.0 -O1
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
            %r = sext <8 x i8> %v to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        short8 r;
        foreach(n; 0..8)
            r.ptr[n] = sa.array[n];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepi8_epi16(A);
    short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE41)
    {
        // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            %r = sext <4 x i8> %v to <4 x i32>
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
    }
    else
    {
        // LDC ARM64: this gives the same codegen as a vmovl_s16/vmovl_s8 sequence would
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi8_epi32(A);
    int[4] correct = [127, -128, 1, -1];
    assert(C.array == correct);
}


/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
    }
    else version(LDC)
    {
        // LDC x86: Generates pmovsxbq since LDC 1.1 -O0,
        // LDC arm64: it's ok since LDC 1.8 -O1
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i8> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        long2 r;
        foreach(n; 0..2)
            r.ptr[n] = sa.array[n];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi8_epi64(A);
    long[2] correct = [127, -128];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1, also good without SSE4.1
        //   arm64: ushll since LDC 1.12 -O1
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu16_epi32(A);
    int[4] correct = [65535, 0, 32768, 32767];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit shorter than below, in -O2
        short8 sa = cast(short8)a;
        long2 r;
        for (int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ushort)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1, also good without SSE4.1
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu16_epi64(A);
    long[2] correct = [65535, 0];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pmovzxdq since LDC 1.12 -O1, also good without SSE4.1
        //   arm64: generates ushll since LDC 1.12 -O1
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = cast(uint)sa.array[0];
        r.ptr[1] = cast(uint)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepu32_epi64(A);
    long[2] correct = [4294967295, 42];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepu8_epi16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
    }
    else
    {
        // LDC x86: generates pmovzxbw since LDC 1.12 -O1, also good without SSE4.1
        //   arm64: ushll since LDC 1.12 -O1
        // PERF: catastrophic with GDC without SSE4.1
        byte16 sa = cast(byte16)a;
        short8 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        r.ptr[2] = cast(ubyte)sa.array[2];
        r.ptr[3] = cast(ubyte)sa.array[3];
        r.ptr[4] = cast(ubyte)sa.array[4];
        r.ptr[5] = cast(ubyte)sa.array[5];
        r.ptr[6] = cast(ubyte)sa.array[6];
        r.ptr[7] = cast(ubyte)sa.array[7];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepu8_epi16(A);
    short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
    assert(C.array == correct);
}


/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit better than below in -O2
        byte16 sa = cast(byte16)a;
        int4 r;
        for (int n = 0; n < 4; ++n)
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxbd since LDC 1.12 -O1, also good without SSE4.1
        // PERF: catastrophic with GDC without SSE4.1
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        r.ptr[2] = cast(ubyte)sa.array[2];
        r.ptr[3] = cast(ubyte)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu8_epi32(A);
    int[4] correct = [127, 128, 1, 255];
    assert(C.array == correct);
}

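// Minimal usage sketch: a typical use of `_mm_cvtepu8_epi32`, widening four packed
// 8-bit channels to 32-bit and then to float (for example to filter pixel data).
unittest
{
    ubyte[4] rgba = [255, 128, 64, 0];
    int packed = rgba[0] | (rgba[1] << 8) | (rgba[2] << 16) | (rgba[3] << 24);
    __m128 channels = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(packed)));
    float[4] correct = [255.0f, 128.0f, 64.0f, 0.0f];
    assert(channels.array == correct);
}
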
/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: this optimizes better than the loop below
        byte16 sa = cast(byte16)a;
        long2 r;
        for (int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
        byte16 sa = cast(byte16)a;
        long2 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu8_epi64(A);
    long[2] correct = [127, 254];
    assert(C.array == correct);
}

/// Conditionally multiply the packed double-precision (64-bit) floating-point elements
/// in `a` and `b` using the high 4 bits in `imm8`, sum the two products, and conditionally
/// store the sum in dst using the low 4 bits of `imm8`.
__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_dppd(a, b, imm8 & 0x33);
    }
    else
    {
        __m128d zero = _mm_setzero_pd();
        __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b);
        double sum = temp.array[0] + temp.array[1];
        return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum));
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 8.0);
    double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B);
    double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B);
    double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B);
    double[2] correct1 = [ 4.0,  4.0];
    double[2] correct2 = [16.0,  0.0];
    double[2] correct3 = [ 0.0, 20.0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}

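// Minimal usage sketch: a 2-element dot product with `_mm_dp_pd`. The high nibble of
// `imm8` (0x30) selects both products; the low nibble (0x01) stores the sum in lane 0.
unittest
{
    __m128d x = _mm_setr_pd(1.0, 2.0);
    __m128d y = _mm_setr_pd(3.0, 4.0);
    double dot = _mm_cvtsd_f64(_mm_dp_pd!0x31(x, y));
    assert(dot == 11.0); // 1*3 + 2*4
}
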
/// Conditionally multiply the packed single-precision (32-bit) floating-point elements
/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products,
/// and conditionally store the sum in result using the low 4 bits of `imm8`.
__m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(byte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_dpps(a, b, cast(byte)imm8);
    }
    else
    {
        __m128 zero = _mm_setzero_ps();
        __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b);
        float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
        return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum));
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
    __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
    float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B);
    float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B);
    float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B);
    float[4] correct1 = [67.0f, 67.0f, 67.0f, 67.0f];
    float[4] correct2 = [23.0f,  0.0f, 23.0f,  0.0f];
    float[4] correct3 = [ 0.0f, 29.0f,  0.0f, 29.0f];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
    assert(R3.array == correct3);
}


/// Extract a 32-bit integer from `a`, selected with `imm8`.
int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
{
    return (cast(int4)a).array[imm8 & 3];
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    assert(_mm_extract_epi32(A, 0) == 1);
    assert(_mm_extract_epi32(A, 1 + 8) == 2);
    assert(_mm_extract_epi32(A, 3 + 4) == 4);
}

/// Extract a 64-bit integer from `a`, selected with `imm8`.
long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
{
    long2 la = cast(long2)a;
    return la.array[imm8 & 1];
}
unittest
{
    __m128i A = _mm_setr_epi64(45, -67);
    assert(_mm_extract_epi64(A, 0) == 45);
    assert(_mm_extract_epi64(A, 1) == -67);
    assert(_mm_extract_epi64(A, 2) == 45);
}

/// Extract an 8-bit integer from `a`, selected with `imm8`.
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
{
    byte16 ba = cast(byte16)a;
    return cast(ubyte) ba.array[imm8 & 15];
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
    assert(_mm_extract_epi8(A, 7) == 7);
    assert(_mm_extract_epi8(A, 13) == 255);
    assert(_mm_extract_epi8(A, 7 + 16) == 7);
}

/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
/// Note: returns a 32-bit $(I integer).
int _mm_extract_ps (__m128 a, const int imm8) @trusted
{
    return (cast(int4)a).array[imm8 & 3];
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
    assert(_mm_extract_ps(A, 0) == 0x3f800000);
    assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
    assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
}

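// Minimal usage sketch: since `_mm_extract_ps` returns the raw 32-bit pattern of the
// selected lane, reinterpret those bits to recover the actual float value.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.5f, 3.0f, -4.0f);
    int bits = _mm_extract_ps(A, 1);
    float lane1 = *cast(float*)&bits;
    assert(lane1 == 2.5f);
}
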

/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an
/// integer value, and store the results as packed double-precision floating-point elements.
__m128d _mm_floor_pd (__m128d a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O2
        long2 l = vcvtmq_s64_f64(a);
        double2 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        return r;
    }
    else
    {
        return _mm_round_pd!1(a);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3f, -2.12f);
    __m128d B = _mm_setr_pd(53.6f, -2.7f);
    A = _mm_floor_pd(A);
    B = _mm_floor_pd(B);
    double[2] correctA = [1.0, -3.0];
    double[2] correctB = [53.0, -3.0];
    assert(A.array == correctA);
    assert(B.array == correctB);
}

/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an
/// integer value, and store the results as packed single-precision floating-point elements.
__m128 _mm_floor_ps (__m128 a) @trusted
{
    static if (LDC_with_ARM64)
    {
        // LDC arm64 acceptable since 1.8 -O1
        int4 l = vcvtmq_s32_f32(a);
        float4 r;
        r.ptr[0] = l.array[0];
        r.ptr[1] = l.array[1];
        r.ptr[2] = l.array[2];
        r.ptr[3] = l.array[3];
        return r;
    }
    else
    {
        return _mm_round_ps!1(a);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
    __m128 C = _mm_floor_ps(A);
    float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
    assert(C.array == correct);
}

/// Round the lower double-precision (64-bit) floating-point element in `b` down to an
/// integer value, store the result as a double-precision floating-point element in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtms_s64_f64(b[0]);
        return a;
    }
    else
    {
        return _mm_round_sd!1(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.3, -2.12);
    __m128d B = _mm_setr_pd(-53.1, -3.7);
    __m128d C = _mm_floor_sd(A, B);
    double[2] correct = [-54.0, -2.12];
    assert(C.array == correct);
}

/// Round the lower single-precision (32-bit) floating-point element in `b` down to an
/// integer value, store the result as a single-precision floating-point element in the
/// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
__m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
{
    static if (LDC_with_ARM64)
    {
        a[0] = vcvtms_s32_f32(b[0]);
        return a;
    }
    else
    {
        return _mm_round_ss!1(a, b);
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
    __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
    __m128 C = _mm_floor_ss(A, B);
    float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f];
    assert(C.array == correct);
}

/// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
    // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
    // LDC arm64: ins.s since LDC 1.8 -O2
    int4 ia = cast(int4)a;
    ia.ptr[imm8 & 3] = i;
    return cast(__m128i)ia;
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4);
    int[4] result = [1, 2, 5, 4];
    assert(C.array == result);
}

/// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
    // LDC x86: always does something sensible.
    long2 la = cast(long2)a;
    la.ptr[imm8 & 1] = i;
    return cast(__m128i)la;
}
unittest
{
    __m128i A = _mm_setr_epi64(1, 2);
    long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
    long[2] result = [1, 5];
    assert(C.array == result);
}

/// Insert the 8-bit integer `i` into `a` at the location specified by `imm8[3:0]`.
/// Copy `a` to result, and insert the lower 8-bit integer from `i` at the location specified by `imm8`.
__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
{
    // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
    // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
    byte16 ba = cast(byte16)a;
    ba.ptr[imm8 & 15] = cast(byte)i;
    return cast(__m128i)ba;
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
    byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    assert(C.array == result);
}


/// Warning: of course it does something totally different from `_mm_insert_epi32`!
/// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b`
/// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]`
/// (elements are zeroed out when the corresponding bit is set).
__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
    }
    else
    {
        float4 tmp2 = a;
        float tmp1 = b.array[(imm8 >> 6) & 3];
        tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
        return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
    float[4] correct = [1.0f, 2.0f, 0.0f, 7.0f];
    assert(C.array == correct);
}

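// Minimal usage sketch: building the `_mm_insert_ps` control byte explicitly.
// Bits 7:6 pick the source lane of `b`, bits 5:4 the destination lane in `a`,
// and bits 3:0 zero out result lanes.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    enum int ctrl = (2 << 6) | (0 << 4) | 0b0000; // B[2] goes to lane 0, nothing zeroed
    __m128 R = _mm_insert_ps!ctrl(A, B);
    float[4] correct = [7.0f, 2.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}
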

/// Compare packed signed 32-bit integers in `a` and `b`, and return packed maximum values.
__m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxsd since LDC 1.1 -O1
        // ARM: smax.4s since LDC 1.8 -O1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        int4 greater = greaterMask!int4(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i higher = _mm_cmpgt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff,  1, -4,  7),
                                      _mm_setr_epi32(        -4, -8,  9, -8));
    int[4] correct = [0x7fffffff, 1, 9, 7];
    assert(R.array == correct);
}

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed maximum values.
__m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pmaxsb since LDC 1.1 -O1
        // ARM64: smax.16b since LDC 1.8.0 -O1
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_max_epi8(A, B);
    byte[16] correct =       [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed maximum values.
__m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pmaxuw since LDC 1.1 -O1
        // ARM64: umax.8h since LDC 1.8.0 -O1
        // PERF: without SSE4.1, LLVM 12 produces a very interesting
        //       psubusw xmm0, xmm1
        //       paddw   xmm0, xmm1
        //       sequence that maybe should go in other min/max intrinsics?
        ushort8 sa = cast(ushort8)a;
        ushort8 sb = cast(ushort8)b;
        ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        b = _mm_subs_epu16(b, a);
        b = _mm_add_epi16(b, a);
        return b;
    }
}
unittest
{
    short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,      7, 0, 57),
                                          _mm_setr_epi16(   -4, -8,  9, -7, 0, -32768, 0,  0));
    short[8] correct =                                  [   -4, -8, -4, -7, 9, -32768, 0, 57];
    assert(R.array == correct);
}

/// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values.
__m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umax.4s since LDC 1.8.0 -O1
        uint4 sa = cast(uint4)a;
        uint4 sb = cast(uint4)b;
        uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i valueShift = _mm_set1_epi32(-0x80000000);
        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff,  1, 4, -7),
                                      _mm_setr_epi32(        -4, -8, 9, -8));
    int[4] correct =                                [        -4, -8, 9, -7];
    assert(R.array == correct);
}

/// Compare packed signed 32-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
        // ARM: smin.4s since LDC 1.8 -O1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        int4 greater = greaterMask!int4(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i higher = _mm_cmplt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff,  1, -4,  7),
                                      _mm_setr_epi32(        -4, -8,  9, -8));
    int[4] correct =                                [        -4, -8, -4, -8];
    assert(R.array == correct);
}

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed minimum values.
__m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pminsb since LDC 1.1 -O1
        // ARM64: smin.16b since LDC 1.8.0 -O1
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_min_epi8(A, B);
    byte[16] correct =       [  4, -8, -4, -8, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 16-bit integers in `a` and `b`, and store packed minimum values in dst.
__m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
        // ARM64: umin.8h since LDC 1.8.0 -O1
        ushort8 sa = cast(ushort8)a;
        ushort8 sb = cast(ushort8)b;
        ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i c = _mm_subs_epu16(b, a);
        b = _mm_sub_epi16(b, c);
        return b;
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,      7, 0, 57),
                                          _mm_setr_epi16(   -4, -8,  9, -7, 0, -32768, 0,  0));
    short[8] correct =                                  [32767,  1,  9, -8, 0,      7, 0,  0];
    assert(R.array == correct);
}

/// Compare packed unsigned 32-bit integers in `a` and `b`, and store packed minimum values in dst.
__m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umin.4s since LDC 1.8.0 -O1
        uint4 sa = cast(uint4)a;
        uint4 sb = cast(uint4)b;
        uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i valueShift = _mm_set1_epi32(-0x80000000);
        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift));
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff,  1, 4, -7),
                                      _mm_setr_epi32(        -4, -8, 9, -8));
    int[4] correct =                                [0x7fffffff,  1, 4, -8];
    assert(R.array == correct);
}

/// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`,
/// store the minimum and index in return value, and zero the remaining bits.
__m128i _mm_minpos_epu16 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
        __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
        __m128i best = _mm_min_epu32(combinedLo, combinedHi);
        best = _mm_min_epu32(best, _mm_srli_si128!8(best));
        best = _mm_min_epu32(best, _mm_srli_si128!4(best));
        short8 sbest = cast(short8)best;
        short8 r;
        r[0] = sbest[1];
        r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie
        r[2] = 0;
        r[3] = 0;
        r[4] = 0;
        r[5] = 0;
        r[6] = 0;
        r[7] = 0;
        return cast(__m128i)r;
    }
    else
    {
        short8 sa = cast(short8)a;
        ushort min = 0xffff;
        int index = 0;
        for (int n = 0; n < 8; ++n)
        {
            ushort c = sa.array[n];
            if (c < min)
            {
                min = c;
                index = n;
            }
        }
        short8 r;
        r.ptr[0] = min;
        r.ptr[1] = cast(short)index;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
    __m128i B = _mm_setr_epi16(14,  4, 4, 2, -3, 2, 5, 6);
    short8 R1 = cast(short8) _mm_minpos_epu16(A);
    short8 R2 = cast(short8) _mm_minpos_epu16(B);
    short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
    short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
    assert(R1.array == correct1);
    assert(R2.array == correct2);
}

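// Minimal usage sketch: unpacking the `_mm_minpos_epu16` result into the minimum
// value (lane 0) and its index (lane 1) with the SSE2 `_mm_extract_epi16`.
unittest
{
    __m128i A = _mm_setr_epi16(40, 30, 20, 10, 50, 60, 70, 80);
    __m128i R = _mm_minpos_epu16(A);
    int minValue = _mm_extract_epi16(R, 0);
    int minIndex = _mm_extract_epi16(R, 1);
    assert(minValue == 10);
    assert(minIndex == 3);
}
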
/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers
/// in `a` compared to those in `b`, and store the 16-bit results in dst.
/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`.
/// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`.
/// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting
/// at the offset specified in `imm8[2]`.
__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
    }
    else
    {
        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high-order quadruplets are unaddressable...
        int b_offset = (imm8 & 3) * 4;

        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        short8 r;

        __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);

        for (int j = 0; j < 8; j += 2)
        {
            int k = a_offset + j;
            __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
                                           0, 0, 0, 0,
                                           ba[k+1], ba[k+2], ba[k+3], ba[k+4],
                                           0, 0, 0, 0);
            short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64
            r.ptr[j]   = diffs.array[0];
            r.ptr[j+1] = diffs.array[4];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3,  4,  5, 6,  7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5,  5,  5, 12, 13, 14, 15);
    short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
    short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
    short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
    short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
    short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
    short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
    short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
    short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
    short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
    short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
    assert(r1.array == correct1);
    assert(r4.array == correct4);
    assert(r5.array == correct5);
    assert(r7.array == correct7);
    assert(r8.array == correct0);
}

/// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`,
/// and store the signed 64-bit results in dst.
__m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSE41)
    {
        // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
        // Use IR instead.
        // This generates pmuldq since LDC 1.2.0 -O0
        enum ir = `
            %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
            %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
            %la = sext <2 x i32> %ia to <2 x i64>
            %lb = sext <2 x i32> %ib to <2 x i64>
            %r = mul <2 x i64> %la, %lb
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // 3 instructions since LDC 1.8 -O2
        // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull
        int2 a_lo = vmovn_s64(cast(long2)a);
        int2 b_lo = vmovn_s64(cast(long2)b);
        return cast(__m128i) vmull_s32(a_lo, b_lo);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        long2 r;
        r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
        r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    long2 R = cast(long2) _mm_mul_epi32(A, B);
    long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
    assert(R.array == correct);
}

/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers,
/// and return the low 32 bits of the intermediate integers.
__m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    // PERF GDC without SSE4.1 could be better
    static if (GDC_with_SSE41)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        // Note: older GDC doesn't have that op, but older GDC
        // also has no support for -msse4.1 detection
        return cast(__m128i)(ia * ib);
    }
    else version(LDC)
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        return cast(__m128i)(ia * ib);
    }
    else
    {
        // DMD doesn't take the above
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] * ib.array[0];
        r.ptr[1] = ia.array[1] * ib.array[1];
        r.ptr[2] = ia.array[2] * ib.array[2];
        r.ptr[3] = ia.array[3] * ib.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
    int4 R = cast(int4) _mm_mullo_epi32(A, B);
    int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
    assert(R.array == correct);
}

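// Minimal usage sketch contrasting the two multiplies above: `_mm_mul_epi32` keeps
// the full 64-bit products of lanes 0 and 2, while `_mm_mullo_epi32` keeps only the
// low 32 bits of each of the four products.
unittest
{
    __m128i A = _mm_setr_epi32(100_000, 1, 200_000, 2);
    __m128i B = _mm_setr_epi32(100_000, 3, 300_000, 4);
    long2 wide = cast(long2) _mm_mul_epi32(A, B);
    int4  low  = cast(int4)  _mm_mullo_epi32(A, B);
    assert(wide.array[0] == 10_000_000_000);          // full product, no truncation
    assert(low.array[0]  == cast(int)10_000_000_000); // same product truncated to 32 bits
    assert(low.array[3]  == 8);
}
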

/// Convert packed signed 32-bit integers from `a` and `b`
/// to packed 16-bit integers using unsigned saturation.
__m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE41)
    {
        // PERF For some reason this doesn't generate the builtin???
        return cast(__m128i) __builtin_ia32_packusdw128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_packusdw128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        int4 z;
        z = 0;
        return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
                                          vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
    }
    else
    {
        // PERF: not great without SSE4.1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        align(16) ushort[8] result;
        for (int i = 0; i < 4; ++i)
        {
            int s = sa.array[i];
            if (s < 0) s = 0;
            if (s > 65535) s = 65535;
            result.ptr[i] = cast(ushort)s;

            s = sb.array[i];
            if (s < 0) s = 0;
            if (s > 65535) s = 65535;
            result.ptr[i+4] = cast(ushort)s;
        }
        return *cast(__m128i*)(result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packus_epi32(A, A);
    short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
    assert(R.array == correct);
}


/// Round the packed double-precision (64-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed double-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_pd(int rounding)(__m128d a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);
            return _mm_setr_pd(lo, hi);
        }
        else
        {
            version(GNU) pragma(inline, false); // else fail unittest with optimizations

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return _mm_setr_pd(lo, hi);
        }
    }
}
unittest
{
    // tested in other intrinsics
}

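// Minimal usage sketch: exercising `_mm_round_pd` directly with the rounding
// flags documented above (its own unittest defers to other intrinsics).
unittest
{
    __m128d A = _mm_setr_pd(1.5, -2.5);
    __m128d N = _mm_round_pd!(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)(A);
    __m128d Z = _mm_round_pd!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(A);
    double[2] correctN = [2.0, -2.0]; // ties round to even
    double[2] correctZ = [1.0, -2.0];
    assert(N.array == correctN);
    assert(Z.array == correctZ);
}
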

/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ps(int rounding)(__m128 a) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            __m128i integers = _mm_cvtps_epi32(a);
            return _mm_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get shuffled
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m128i integers = _mm_cvtps_epi32(a);

            // Convert back to float to achieve the rounding.
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice versa), so this function won't work for
            // large values. (TODO: what range exactly?)
            __m128 result = _mm_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
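
// Extra illustrative check (not from the original test suite): with
// _MM_FROUND_TO_POS_INF the packed floats are rounded toward +infinity,
// i.e. a ceiling. Values are kept small since the fallback path goes
// through 32-bit integers.
unittest
{
    __m128 A = _mm_setr_ps(1.3f, -2.7f, 2.0f, 0.2f);
    float4 R = cast(float4) _mm_round_ps!(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)(A);
    float[4] correct = [2.0f, -2.0f, 2.0f, 1.0f];
    assert(R.array == correct);
}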


/// Round the lower double-precision (64-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a double-precision floating-point element
/// in the lower element of result, and copy the upper element from `a` to the upper element of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;
            return a;
        }
        else
        {
            version(GNU) pragma(inline, false); // else fails unittests with optimizations

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;

            // Convert back to double to achieve the rounding.
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice versa), so this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}


/// Round the lower single-precision (32-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a single-precision floating-point element
/// in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;
            return a;
        }
        else version(GNU)
        {
            pragma(inline, false)
            __m128 GDCworkaround() nothrow @nogc @trusted
            {
                uint old = _MM_GET_ROUNDING_MODE();
                _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

                // Convert to 32-bit integer
                int b0 = _mm_cvtss_si32(b);
                a.ptr[0] = b0;

                // Convert back to float to achieve the rounding.
                // The problem is that a 32-bit float can't represent all the values
                // a 32-bit integer can (and vice versa), so this function won't work for
                // large values. (TODO: what range exactly?)
                _MM_SET_ROUNDING_MODE(old);
                return a;
            }
            return GDCworkaround();
        }
        else
        {
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 32-bit integer
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;

            // Convert back to float to achieve the rounding.
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice versa), so this function won't work for
            // large values. (TODO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}
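
// Extra illustrative check (not from the original test suite): the scalar
// variants only round the low lane of `b` (here with _MM_FROUND_TO_ZERO,
// i.e. truncation) and copy the remaining lane(s) from `a`.
unittest
{
    __m128d A = _mm_setr_pd(100.0, 200.0);
    __m128d B = _mm_setr_pd(-2.7, 999.0);
    double2 R = cast(double2) _mm_round_sd!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(A, B);
    double[2] correctD = [-2.0, 200.0];
    assert(R.array == correctD);

    __m128 C = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    __m128 D = _mm_setr_ps(1.6f, 0.0f, 0.0f, 0.0f);
    float4 R2 = cast(float4) _mm_round_ss!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(C, D);
    float[4] correctF = [1.0f, 20.0f, 30.0f, 40.0f];
    assert(R2.array == correctF);
}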


/// Load 128 bits of integer data from memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
__m128i _mm_stream_load_si128 (__m128i * mem_addr) @trusted
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    return *mem_addr; // it's a regular move instead
}


/// Return 1 if all bits in `a` are 1's. Else return 0.
int _mm_test_all_ones (__m128i a) @safe
{
    return _mm_testc_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(-1);
    __m128i B = _mm_set_epi32(-1, -2, -1, -1);
    assert(_mm_test_all_ones(A) == 1);
    assert(_mm_test_all_ones(B) == 0);
}

/// Return 1 if all bits in `a` are 0's. Else return 0.
// This is a #BONUS since it was lacking in the Intel Intrinsics API.
int _mm_test_all_zeros (__m128i a) @safe
{
    return _mm_testz_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(0);
    __m128i B = _mm_set_epi32(0, 8, 0, 0);
    assert(_mm_test_all_zeros(A) == 1);
    assert(_mm_test_all_zeros(B) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
/// and return 1 if the result is zero, otherwise return 0.
int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
{
    return _mm_testz_si128(a, mask); // it's really the same, but with a good name
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `mask`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
{
    return _mm_testnzc_si128(a, mask);
}
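
// Extra illustrative check (not from the original test suite): a mask that
// selects only bits cleared in `a` gives a zero masked AND, while a mask that
// hits both set and cleared bits of `a` is the "mix" case.
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i MaskZero = _mm_setr_epi32(0xf0, 0x01, 0x00, 0x00); // only bits that are 0 in A
    __m128i MaskMix  = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00); // some bits of A, some not
    assert(_mm_test_all_zeros(A, MaskZero) == 1);
    assert(_mm_test_all_zeros(A, MaskMix) == 0);
    assert(_mm_test_mix_ones_zeros(A, MaskMix) == 1);
    assert(_mm_test_mix_ones_zeros(A, A) == 0); // mask fully covered by `a`, so CF = 1
}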

/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the
/// result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 1 in `a`.
int _mm_testc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testc_si128(A, A) == 1);
    assert(_mm_testc_si128(A, M1) == 0);
    assert(_mm_testc_si128(A, M2) == 1);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
        long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);

        return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
                | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
    }
    else
    {
        __m128i c = a & b;
        __m128i d = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return !( (c.array == zero) || (d.array == zero) );
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
    __m128i Z = _mm_setzero_si128();
    assert(_mm_testnzc_si128(A, Z) == 0);
    assert(_mm_testnzc_si128(A, M) == 1);
    assert(_mm_testnzc_si128(A, A) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm_testz_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testz_si128(A, A) == 0);
    assert(_mm_testz_si128(A, M1) == 1);
    assert(_mm_testz_si128(A, M2) == 0);
}
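
// Extra illustrative check (not from the original test suite): the PTEST-style
// predicates relate through the ZF/CF flags, so "neither all masked bits are 0
// nor all masked bits are 1" is exactly the non-zero-and-non-carry case.
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
    bool mix = (_mm_testz_si128(A, M) == 0) && (_mm_testc_si128(A, M) == 0);
    assert(_mm_testnzc_si128(A, M) == (mix ? 1 : 0));
}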