/**
* SSSE3 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSSE3
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.tmmintrin;

public import inteli.types;
import inteli.internals;

public import inteli.pmmintrin;
import inteli.mmx;

nothrow @nogc:


// SSSE3 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSSE3
// Note: this header will work whether you have SSSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+ssse3"] or equivalent to actively
// generate SSSE3 instructions.

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m128i _mm_abs_epi16 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSW, a);
    }
    else static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pabsw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s16(cast(short8)a);
    }
    else
    {
        // LDC x86: generates pabsw since LDC 1.1 -O2
        short8 sa = cast(short8)a;
        for (int i = 0; i < 8; ++i)
        {
            short s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s));
        }
        return cast(__m128i)sa;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000);
    short8 B = cast(short8) _mm_abs_epi16(A);
    short[8] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m128i _mm_abs_epi32 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSD, cast(int4)a);
    }
    else static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pabsd128(cast(int4)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s32(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pabsd since LDC 1.1 -O2
        int4 sa = cast(int4)a;
        for (int i = 0; i < 4; ++i)
        {
            int s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : -s;
        }
        return cast(__m128i)sa;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647);
    int4 B = cast(int4) _mm_abs_epi32(A);
    int[4] correct = [0, 1, -2_147_483_648, 2_147_483_647];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
__m128i _mm_abs_epi8 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSB, cast(byte16)a);
    }
    else static if (GDC_with_SSSE3)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pabsb128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s8(cast(byte16)a);
    }
    else version(LDC)
    {
        // LDC x86: generates pabsb since LDC 1.1 -O1
        // arm64: generates abs since LDC 1.8 -O1
        enum ir = `
            %n = sub <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %0
            %s = icmp slt <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %0
            %r = select <16 x i1> %s, <16 x i8> %0, <16 x i8> %n
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16)(cast(byte16)a);
    }
    else
    {
        // A loop version like in _mm_abs_epi16/_mm_abs_epi32 would be very slow
        // in LDC x86 and wouldn't vectorize. Doesn't generate pabsb in LDC though.
        return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 B = cast(byte16) _mm_abs_epi8(A);
    byte[16] correct = [0, 1, -128, 127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(B.array == correct);
}

/// Compute the absolute value of packed 64-bit floating-point elements in `a`.
/// #BONUS.
__m128d _mm_abs_pd (__m128d a) @trusted
{
    long2 mask = 0x7fff_ffff_ffff_ffff;
    return cast(__m128d)((cast(long2)a) & mask);
}
unittest
{
    __m128d A = _mm_setr_pd(-42.0f, -double.infinity);
    __m128d R = _mm_abs_pd(A);
    double[2] correct = [42.0f, +double.infinity];
    assert(R.array == correct);
}

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m64 _mm_abs_pi16 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi16(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi16(0, -1, -32768, 32767);
    short4 B = cast(short4) _mm_abs_pi16(A);
    short[4] correct = [0, 1, -32768, 32767];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m64 _mm_abs_pi32 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi32(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi32(-1, -2_147_483_648);
    int2 B = cast(int2) _mm_abs_pi32(A);
    int[2] correct = [1, -2_147_483_648];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
__m64 _mm_abs_pi8 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi8(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi8(0, -1, -128, -127, 127, 0, 0, 0);
    byte8 B = cast(byte8) _mm_abs_pi8(A);
    byte[8] correct = [0, 1, -128, 127, 127, 0, 0, 0];
    assert(B.array == correct);
}

/// Compute the absolute value of packed 32-bit floating-point elements in `a`.
/// #BONUS.
__m128 _mm_abs_ps (__m128 a) @trusted
{
    __m128i mask = 0x7fffffff;
    return cast(__m128)((cast(__m128i)a) & mask);
}
unittest
{
    __m128 A = _mm_setr_ps(-0.0f, 10.0f, -42.0f, -float.infinity);
    __m128 R = _mm_abs_ps(A);
    float[4] correct = [0.0f, 10.0f, 42.0f, +float.infinity];
    assert(R.array == correct);
}

/// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the result right by `count` bytes, and return the low 16 bytes.
__m128i _mm_alignr_epi8(ubyte count)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_palignr128(cast(long2)a, cast(long2)b, count * 8);
    }
    else version(LDC)
    {
        static if (count >= 32)
        {
            return _mm_setzero_si128();
        }
        else static if (count < 16)
        {
            // Generates palignr since LDC 1.1 -O1
            // Also generates a single ext instruction on arm64.
            return cast(__m128i) shufflevector!(byte16, ( 0 + count),
                                                        ( 1 + count),
                                                        ( 2 + count),
                                                        ( 3 + count),
                                                        ( 4 + count),
                                                        ( 5 + count),
                                                        ( 6 + count),
                                                        ( 7 + count),
                                                        ( 8 + count),
                                                        ( 9 + count),
                                                        (10 + count),
                                                        (11 + count),
                                                        (12 + count),
                                                        (13 + count),
                                                        (14 + count),
                                                        (15 + count))(cast(byte16)b, cast(byte16)a);
        }
        else
        {
            return cast(__m128i) shufflevector!(byte16, ( 0 + count) % 32,
                                                        ( 1 + count) % 32,
                                                        ( 2 + count) % 32,
                                                        ( 3 + count) % 32,
                                                        ( 4 + count) % 32,
                                                        ( 5 + count) % 32,
                                                        ( 6 + count) % 32,
                                                        ( 7 + count) % 32,
                                                        ( 8 + count) % 32,
                                                        ( 9 + count) % 32,
                                                        (10 + count) % 32,
                                                        (11 + count) % 32,
                                                        (12 + count) % 32,
                                                        (13 + count) % 32,
                                                        (14 + count) % 32,
                                                        (15 + count) % 32)(cast(byte16)_mm_setzero_si128(), cast(byte16)a);
        }
    }
    else
    {
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        byte16 r;

        for (int i = 0; i < 16; ++i)
        {
            const int srcpos = count + cast(int)i;
            if (srcpos > 31)
            {
                r.ptr[i] = 0;
            }
            else if (srcpos > 15)
            {
                r.ptr[i] = ab.array[(srcpos) & 15];
            }
            else
            {
                r.ptr[i] = bb.array[srcpos];
            }
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);

    {
        byte16 C = cast(byte16)_mm_alignr_epi8!0(A, B);
        byte[16] correct = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
        assert(C.array == correct);
    }
    {
        byte16 C = cast(byte16)_mm_alignr_epi8!20(A, B);
        byte[16] correct = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0];
        assert(C.array == correct);
    }
    {
        byte16 C = cast(byte16)_mm_alignr_epi8!34(A, B);
        byte[16] correct = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        assert(C.array == correct);
    }

    __m128i D = _mm_setr_epi8(-123, -82, 103, -69, 103, -26, 9, 106, 58, -11, 79, -91, 114, -13, 110, 60);
    __m128i E = _mm_setr_epi8(25, -51, -32, 91, -85, -39, -125, 31, -116, 104, 5, -101, 127, 82, 14, 81);
    byte16 F = cast(byte16)_mm_alignr_epi8!8(D, E);
    byte[16] correct = [-116, 104, 5, -101, 127, 82, 14, 81, -123, -82, 103, -69, 103, -26, 9, 106];
    assert(F.array == correct);
}
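
// Usage sketch (illustrative, not itself an SSSE3 intrinsic): with `lo` holding bytes
// 0..15 of a byte stream and `hi` holding bytes 16..31, _mm_alignr_epi8!n(hi, lo)
// extracts the 16-byte window that starts n bytes into the stream.
unittest
{
    __m128i lo = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i hi = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 window = cast(byte16) _mm_alignr_epi8!5(hi, lo); // bytes 5..20 of the stream
    byte[16] expected = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20];
    assert(window.array == expected);
}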

/// Concatenate 8-byte blocks in `a` and `b` into a 16-byte temporary result, shift the result right by `count` bytes, and return the low 8 bytes.
__m64 _mm_alignr_pi8(ubyte count)(__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_palignr(cast(long1)a, cast(long1)b, count * 8);
    }
    else version(LDC)
    {
        static if (count >= 16)
        {
            return _mm_setzero_si64();
        }
        else static if (count < 8)
        {
            // Note: in LDC x86 this uses a pshufb.
            // Generates ext in arm64.
            return cast(__m64) shufflevector!(byte8, (0 + count),
                                                     (1 + count),
                                                     (2 + count),
                                                     (3 + count),
                                                     (4 + count),
                                                     (5 + count),
                                                     (6 + count),
                                                     (7 + count))(cast(byte8)b, cast(byte8)a);
        }
        else
        {
            return cast(__m64) shufflevector!(byte8, (0 + count) % 16,
                                                     (1 + count) % 16,
                                                     (2 + count) % 16,
                                                     (3 + count) % 16,
                                                     (4 + count) % 16,
                                                     (5 + count) % 16,
                                                     (6 + count) % 16,
                                                     (7 + count) % 16)(cast(byte8)_mm_setzero_si64(), cast(byte8)a);
        }
    }
    else
    {
        byte8 ab = cast(byte8)a;
        byte8 bb = cast(byte8)b;
        byte8 r;

        for (int i = 0; i < 8; ++i)
        {
            const int srcpos = count + cast(int)i;
            if (srcpos > 15)
            {
                r.ptr[i] = 0;
            }
            else if (srcpos > 7)
            {
                r.ptr[i] = ab.array[(srcpos) & 7];
            }
            else
            {
                r.ptr[i] = bb.array[srcpos];
            }
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
    __m64 B = _mm_setr_pi8(17, 18, 19, 20, 21, 22, 23, 24);

    {
        byte8 C = cast(byte8)_mm_alignr_pi8!0(A, B);
        byte[8] correct = [17, 18, 19, 20, 21, 22, 23, 24];
        assert(C.array == correct);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!3(A, B);
        byte[8] correct = [20, 21, 22, 23, 24, 1, 2, 3];
        assert(C.array == correct);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!11(A, B);
        byte[8] correct = [4, 5, 6, 7, 8, 0, 0, 0];
        assert(C.array == correct);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!17(A, B);
        byte[8] correct = [0, 0, 0, 0, 0, 0, 0, 0];
        assert(C.array == correct);
    }
}

/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m128i _mm_hadd_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i)vpaddq_s16(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
        r.ptr[2] = cast(short)(sa.array[4] + sa.array[5]);
        r.ptr[3] = cast(short)(sa.array[6] + sa.array[7]);
        r.ptr[4] = cast(short)(sb.array[0] + sb.array[1]);
        r.ptr[5] = cast(short)(sb.array[2] + sb.array[3]);
        r.ptr[6] = cast(short)(sb.array[4] + sb.array[5]);
        r.ptr[7] = cast(short)(sb.array[6] + sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
    short8 C = cast(short8) _mm_hadd_epi16(A, A);
    short[8] correct = [ -1, 12, 48, 32767, -1, 12, 48, 32767];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m128i _mm_hadd_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i)vpaddq_s32(cast(int4)a, cast(int4)b);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] + ia.array[1];
        r.ptr[1] = ia.array[2] + ia.array[3];
        r.ptr[2] = ib.array[0] + ib.array[1];
        r.ptr[3] = ib.array[2] + ib.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, -2, int.min, -1);
    __m128i B = _mm_setr_epi32(1, int.max, 4, -4);
    int4 C = cast(int4) _mm_hadd_epi32(A, B);
    int[4] correct = [ -1, int.max, int.min, 0 ];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m64 _mm_hadd_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_phaddw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m64) vpadd_s16(cast(short4)a, cast(short4)b);
    }
    else
    {
        // LDC x86: generates phaddw since LDC 1.24 -O2.
        short4 r;
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
        r.ptr[2] = cast(short)(sb.array[0] + sb.array[1]);
        r.ptr[3] = cast(short)(sb.array[2] + sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(1, -2, 4, 8);
    __m64 B = _mm_setr_pi16(16, 32, -1, -32768);
    short4 C = cast(short4) _mm_hadd_pi16(A, B);
    short[4] correct = [ -1, 12, 48, 32767 ];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`,
/// and pack the signed 32-bit results.
__m64 _mm_hadd_pi32 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_phaddd(cast(int2)a, cast(int2)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m64) vpadd_s32(cast(int2)a, cast(int2)b);
    }
    else
    {
        // LDC x86: generates phaddd since LDC 1.24 -O2
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 r;
        r.ptr[0] = ia.array[0] + ia.array[1];
        r.ptr[1] = ib.array[0] + ib.array[1];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi32(int.min, -1);
    __m64 B = _mm_setr_pi32(1, int.max);
    int2 C = cast(int2) _mm_hadd_pi32(A, B);
    int[2] correct = [ int.max, int.min ];
    assert(C.array == correct);
}
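
// Usage sketch (illustrative, not itself an SSSE3 intrinsic): two passes of
// _mm_hadd_epi32 reduce the four 32-bit lanes to their total in every lane,
// assuming wrap-around on overflow is acceptable.
unittest
{
    __m128i v = _mm_setr_epi32(10, 20, 30, 40);
    __m128i s = _mm_hadd_epi32(v, v); // [30, 70, 30, 70]
    s = _mm_hadd_epi32(s, s);         // [100, 100, 100, 100]
    assert((cast(int4)s).array[0] == 100);
}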

/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m128i _mm_hadds_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqadd sequence
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)vqaddq_s16(c, d);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] + sa.array[5]);
        r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] + sa.array[7]);
        r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
        r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
        r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] + sb.array[5]);
        r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] + sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
    short8 C = cast(short8) _mm_hadds_epi16(A, A);
    short[8] correct = [ -1, 12, 48, -32768, -1, 12, 48, -32768];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m64 _mm_hadds_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phaddsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // Note: LDC doesn't have __builtin_ia32_phaddsw
        long2 la;
        la.ptr[0] = a.array[0];
        long2 lb;
        lb.ptr[0] = b.array[0];
        int4 sum = cast(int4)__builtin_ia32_phaddsw128(cast(short8)la, cast(short8)lb);
        int2 r;
        r.ptr[0] = sum.array[0];
        r.ptr[1] = sum.array[2];
        return cast(__m64)r;
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqadd sequence
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)vqadd_s16(c, d);
    }
    else
    {
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
        r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(-16, 32, -100, -32768);
    __m64 B = _mm_setr_pi16( 64, 32,    1,  32767);
    short4 C = cast(short4) _mm_hadds_pi16(A, B);
    short[4] correct = [ 16, -32768, 96, 32767];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m128i _mm_hsub_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produces uzp1/uzp2/sub sequence since LDC 1.8 -O1
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)(c - d);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
        r.ptr[2] = cast(short)(sa.array[4] - sa.array[5]);
        r.ptr[3] = cast(short)(sa.array[6] - sa.array[7]);
        r.ptr[4] = cast(short)(sb.array[0] - sb.array[1]);
        r.ptr[5] = cast(short)(sb.array[2] - sb.array[3]);
        r.ptr[6] = cast(short)(sb.array[4] - sb.array[5]);
        r.ptr[7] = cast(short)(sb.array[6] - sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(short.min, 1, 4, 8, 16, 32, 1, -32768);
    short8 C = cast(short8) _mm_hsub_epi16(A, A);
    short[8] correct = [ short.max, -4, -16, -32767, short.max, -4, -16, -32767];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m128i _mm_hsub_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produces uzp1/uzp2/sub sequence since LDC 1.8 -O1
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 c = shufflevector!(int4, 0, 2, 4, 6)(ia, ib);
        int4 d = shufflevector!(int4, 1, 3, 5, 7)(ia, ib);
        return cast(__m128i)(c - d);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] - ia.array[1];
        r.ptr[1] = ia.array[2] - ia.array[3];
        r.ptr[2] = ib.array[0] - ib.array[1];
        r.ptr[3] = ib.array[2] - ib.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, int.min, 1);
    __m128i B = _mm_setr_epi32(int.max, -1, 4, 4);
    int4 C = cast(int4) _mm_hsub_epi32(A, B);
    int[4] correct = [ -1, int.max, int.min, 0 ];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`,
/// and pack the signed 16-bit results.
__m64 _mm_hsub_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produces uzp1/uzp2/sub sequence since LDC 1.3 -O1
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)(c - d);
    }
    else
    {
        // LDC x86: generates phsubw since LDC 1.24 -O2
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
        r.ptr[2] = cast(short)(sb.array[0] - sb.array[1]);
        r.ptr[3] = cast(short)(sb.array[2] - sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(short.min, 1, 4, 8);
    __m64 B = _mm_setr_pi16(16, 32, 1, -32768);
    short4 C = cast(short4) _mm_hsub_pi16(A, B);
    short[4] correct = [ short.max, -4, -16, -32767];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`,
/// and pack the signed 32-bit results.
__m64 _mm_hsub_pi32 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubd(cast(int2)a, cast(int2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: generates zip1+zip2+sub sequence since LDC 1.8 -O1
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 c = shufflevector!(int2, 0, 2)(ia, ib);
        int2 d = shufflevector!(int2, 1, 3)(ia, ib);
        return cast(__m64)(c - d);
    }
    else
    {
        // LDC x86: generates phsubd since LDC 1.24 -O2
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 r;
        r.ptr[0] = ia.array[0] - ia.array[1];
        r.ptr[1] = ib.array[0] - ib.array[1];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi32(int.min, 1);
    __m64 B = _mm_setr_pi32(int.max, -1);
    int2 C = cast(int2) _mm_hsub_pi32(A, B);
    int[2] correct = [ int.max, int.min ];
    assert(C.array == correct);
}
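
// Usage sketch (illustrative, not itself an SSSE3 intrinsic): with 16-bit values
// interleaved as (x, y) pairs, _mm_hsub_epi16 yields x - y for every pair in one step.
unittest
{
    __m128i pairs = _mm_setr_epi16(100, 80, 50, 50, -3, 7, 0, -20);
    short8 diff = cast(short8) _mm_hsub_epi16(pairs, pairs);
    short[8] expected = [20, 0, -10, 20, 20, 0, -10, 20];
    assert(diff.array == expected);
}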

/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqsub sequence
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)vqsubq_s16(c, d);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] - sa.array[5]);
        r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] - sa.array[7]);
        r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
        r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
        r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] - sb.array[5]);
        r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] - sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 32767, -1, -10, 32767);
    short8 C = cast(short8) _mm_hsubs_epi16(A, A);
    short[8] correct = [ 3, -4, 32767, -32768, 3, -4, 32767, -32768 ];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // Note: LDC doesn't have __builtin_ia32_phsubsw
        long2 la;
        la.ptr[0] = a.array[0];
        long2 lb;
        lb.ptr[0] = b.array[0];
        int4 sum = cast(int4)__builtin_ia32_phsubsw128(cast(short8)la, cast(short8)lb);
        int2 r;
        r.ptr[0] = sum.array[0];
        r.ptr[1] = sum.array[2];
        return cast(__m64)r;
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqsub sequence in -O1
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)vqsub_s16(c, d);
    }
    else
    {
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
        r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(-16, 32, 100, -32768);
    __m64 B = _mm_setr_pi16( 64, 30,  -9,  32767);
    short4 C = cast(short4) _mm_hsubs_pi16(A, B);
    short[4] correct = [ -48, 32767, 34, -32768];
    assert(C.array == correct);
}

/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding
/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers.
/// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
/// and pack the saturated results.
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // zero-extend a to 16-bit
        __m128i zero = _mm_setzero_si128();
        __m128i a_lo = _mm_unpacklo_epi8(a, zero);
        __m128i a_hi = _mm_unpackhi_epi8(a, zero);

        // sign-extend b to 16-bit
        __m128i b_lo = _mm_unpacklo_epi8(b, zero);
        __m128i b_hi = _mm_unpackhi_epi8(b, zero);
        b_lo = _mm_srai_epi16( _mm_slli_epi16(b_lo, 8), 8);
        b_hi = _mm_srai_epi16( _mm_slli_epi16(b_hi, 8), 8);

        // Multiply element-wise, no overflow can occur
        __m128i c_lo = _mm_mullo_epi16(a_lo, b_lo);
        __m128i c_hi = _mm_mullo_epi16(a_hi, b_hi);

        // Add pairwise with saturating horizontal add
        return _mm_hadds_epi16(c_lo, c_hi);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(  -1,  10, 100, -128,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // u8
    __m128i B = _mm_setr_epi8(-128, -30, 100,  127, -1, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0); // i8
    short8 C = cast(short8) _mm_maddubs_epi16(A, B);
    short[8] correct = [ -32768, 26256, 0, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding
/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers.
/// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
/// and pack the saturated results.
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_pmaddubsw(cast(byte8)a, cast(byte8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        __m128i A = to_m128i(a);
        __m128i B = to_m128i(b);
        return to_m64( cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)A, cast(byte16)B) );
    }
    else
    {
        // zero-extend a to 16-bit
        __m128i zero = _mm_setzero_si128();
        __m128i A = _mm_unpacklo_epi8(to_m128i(a), zero);

        // sign-extend b to 16-bit
        __m128i B = _mm_unpacklo_epi8(to_m128i(b), zero);
        B = _mm_srai_epi16( _mm_slli_epi16(B, 8), 8);

        // Multiply element-wise, no overflow can occur
        __m128i c = _mm_mullo_epi16(A, B);

        // Add pairwise with saturating horizontal add
        return to_m64( _mm_hadds_epi16(c, zero) );
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(  -1,  10, 100, -128,  0, 0, 0, 0); // u8
    __m64 B = _mm_setr_pi8(-128, -30, 100,  127, -1, 2, 4, 6); // i8
    short4 C = cast(short4) _mm_maddubs_pi16(A, B);
    short[4] correct = [ -32768, 26256, 0, 0];
    assert(C.array == correct);
}
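
// Usage sketch (illustrative, not itself an SSSE3 intrinsic): because `a` is read as
// unsigned bytes and `b` as signed bytes, _mm_maddubs_epi16 can pair full-range pixel
// data with small signed weights, here a [1, -1] difference of adjacent bytes.
unittest
{
    __m128i pixels  = _mm_setr_epi8(cast(byte)200, 10, 0, cast(byte)255, 7, 7, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0);
    __m128i weights = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
    short8 sums = cast(short8) _mm_maddubs_epi16(pixels, weights);
    short[8] expected = [190, -255, 0, 0, 0, 0, 0, 0];
    assert(sums.array == expected);
}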

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        int4 mul_lo = vmull_s16(vget_low_s16(cast(short8)a),
                                vget_low_s16(cast(short8)b));
        int4 mul_hi = vmull_s16(vget_high_s16(cast(short8)a),
                                vget_high_s16(cast(short8)b));

        // Rounding narrowing shift right
        // narrow = (int16_t)((mul + 16384) >> 15);
        short4 narrow_lo = vrshrn_n_s32(mul_lo, 15);
        short4 narrow_hi = vrshrn_n_s32(mul_hi, 15);

        // Join together.
        return cast(__m128i) vcombine_s16(narrow_lo, narrow_hi);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;

        for (int i = 0; i < 8; ++i)
        {
            // I doubted it at first, but an exhaustive search shows this to be equivalent to the Intel pseudocode.
            r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
        }

        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(12345, -32768, 32767, 0, 1, 845, -6999, -1);
    __m128i B = _mm_setr_epi16(8877, -24487, 15678, 32760, 1, 0, -149, -1);
    short8 C = cast(short8) _mm_mulhrs_epi16(A, B);
    short[8] correct = [3344, 24487, 15678, 0, 0, 0, 32, 0];
    assert(C.array == correct);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_pmulhrsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m64) to_m64( cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8) to_m128i(a), cast(short8) to_m128i(b)) );
    }
    else static if (LDC_with_ARM64)
    {
        int4 mul = vmull_s16(cast(short4)a, cast(short4)b);

        // Rounding narrowing shift right
        // (int16_t)((mul + 16384) >> 15);
        return cast(__m64) vrshrn_n_s32(mul, 15);
    }
    else
    {
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;

        for (int i = 0; i < 4; ++i)
        {
            r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(12345, -32768, 32767, 0);
    __m64 B = _mm_setr_pi16(8877, -24487, 15678, 32760);
    short4 C = cast(short4) _mm_mulhrs_pi16(A, B);
    short[4] correct = [3344, 24487, 15678, 0];
    assert(C.array == correct);
}
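
// Usage sketch (illustrative, not itself an SSSE3 intrinsic): _mm_mulhrs_epi16 computes
// (a * b + 0x4000) >> 15 per lane, i.e. a rounded Q15 fixed-point multiply. With a gain
// of 16384 (0.5 in Q15), it halves a 16-bit signal in a single multiply.
unittest
{
    __m128i gain   = _mm_set1_epi16(16384); // 0.5 in Q15
    __m128i signal = _mm_setr_epi16(32000, -32000, 2, -2, 1, -1, 0, 30000);
    short8 scaled  = cast(short8) _mm_mulhrs_epi16(signal, gain);
    short[8] expected = [16000, -16000, 1, -1, 1, 0, 0, 15000];
    assert(scaled.array == expected);
}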

/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
__m128i _mm_shuffle_epi8 (__m128i a, __m128i b) @trusted
{
    // This is the lovely pshufb.
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
    }
    else static if (LDC_with_ARM64)
    {
        byte16 bb = cast(byte16)b;
        byte16 mask;
        mask = cast(byte)(0x8F);
        bb = bb & mask;
        byte16 r = vqtbl1q_s8(cast(byte16)a, bb);
        return cast(__m128i)r;
    }
    else
    {
        byte16 r;
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        for (int i = 0; i < 16; ++i)
        {
            byte s = bb.array[i];
            r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 15 ];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m128i B = _mm_setr_epi8(15, -128, 13 + 16, -12, 11, -10, 9, 8, 7, 6, -5, 4, 3, -2, 1, 0);
    byte16 C = cast(byte16) _mm_shuffle_epi8(A, B);
    byte[16] correct = [0, 0, 2, 0, 4, 0, 6, 7, 8, 9, 0, 11, 12, 0, 14, 15];
    assert(C.array == correct);
}

/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
__m64 _mm_shuffle_pi8 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        alias ubyte8 = __vector(ubyte[8]);
        return cast(__m64) __builtin_ia32_pshufb(cast(ubyte8) a, cast(ubyte8) b);
    }
    else static if (LDC_with_SSSE3)
    {
        // GDC does a proper dance to avoid MMX registers; do it manually in LDC,
        // since __builtin_ia32_pshufb doesn't exist there.
        __m128i A = to_m128i(a);
        __m128i index = to_m128i(b);
        index = index & _mm_set1_epi32(0xF7F7F7F7);
        return to_m64( cast(__m128i) __builtin_ia32_pshufb128(cast(byte16)A, cast(byte16) index) );
    }
    else static if (LDC_with_ARM64)
    {
        byte8 bb = cast(byte8)b;
        byte8 mask;
        mask = cast(byte)(0x87);
        bb = bb & mask;
        __m128i l = to_m128i(a);
        byte8 r = vtbl1_s8(cast(byte16)l, cast(byte8)bb);
        return cast(__m64)r;
    }
    else
    {
        byte8 r;
        byte8 ba = cast(byte8)a;
        byte8 bb = cast(byte8)b;
        for (int i = 0; i < 8; ++i)
        {
            byte s = bb.array[i];
            r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 7 ];
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    __m64 B = _mm_setr_pi8(7, 6, -5, 4, 3 + 8, -2, 1, 0);
    byte8 C = cast(byte8) _mm_shuffle_pi8(A, B);
    byte[8] correct = [0, 1, 0, 3, 4, 0, 6, 7];
    assert(C.array == correct);
}
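
// Usage sketch (illustrative, not itself an SSSE3 intrinsic): a constant control mask
// turns _mm_shuffle_epi8 into an arbitrary byte permutation, here reversing the
// 16 bytes of a register.
unittest
{
    __m128i v   = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    __m128i rev = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    byte16 r = cast(byte16) _mm_shuffle_epi8(v, rev);
    byte[16] expected = [16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1];
    assert(r.array == expected);
}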

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // LDC arm64: 5 instructions
        __m128i mask = _mm_srai_epi16(b, 15);
        __m128i zeromask = _mm_cmpeq_epi16(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi16(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-2, -1, 0, 1, 2, short.min, short.min, short.min);
    __m128i B = _mm_setr_epi16(-1, 0, -1, 1, -2, -50, 0, 50);
    short8 C = cast(short8) _mm_sign_epi16(A, B);
    short[8] correct = [ 2, 0, 0, 1, -2, short.min, 0, short.min];
    assert(C.array == correct);
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
    }
    else
    {
        __m128i mask = _mm_srai_epi32(b, 31);
        __m128i zeromask = _mm_cmpeq_epi32(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi32(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-2, -1, 0, int.max);
    __m128i B = _mm_setr_epi32(-1, 0, -1, 1);
    int4 C = cast(int4) _mm_sign_epi32(A, B);
    int[4] correct = [ 2, 0, 0, int.max];
    assert(C.array == correct);
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        __m128i mask = _mm_cmplt_epi8(b, _mm_setzero_si128()); // extend sign bit
        __m128i zeromask = _mm_cmpeq_epi8(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi8(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-2, -1, 0, 1, 2, byte.min, byte.min, byte.min, -1, 0, -1, 1, -2, -50, 0, 50);
    __m128i B = _mm_setr_epi8(-1, 0, -1, 1, -2, -50, 0, 50, -2, -1, 0, 1, 2, byte.min, byte.min, byte.min);
    byte16 C = cast(byte16) _mm_sign_epi8(A, B);
    byte[16] correct = [ 2, 0, 0, 1, -2, byte.min, 0, byte.min, 1, 0, 0, 1, -2, 50, 0, -50];
    assert(C.array == correct);
}

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi16 (__m64 a, __m64 b) @trusted
{
    return to_m64( _mm_sign_epi16( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi16( 2, short.min, short.min, short.min);
    __m64 B = _mm_setr_pi16(-2, -50, 0, 50);
    short4 C = cast(short4) _mm_sign_pi16(A, B);
    short[4] correct = [-2, short.min, 0, short.min];
    assert(C.array == correct);
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi32 (__m64 a, __m64 b) @trusted
{
    return to_m64( _mm_sign_epi32( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi32(-2, -100);
    __m64 B = _mm_setr_pi32(-1, 0);
    int2 C = cast(int2) _mm_sign_pi32(A, B);
    int[2] correct = [ 2, 0];
    assert(C.array == correct);
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi8 (__m64 a, __m64 b) @trusted
{
    return to_m64( _mm_sign_epi8( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi8(-2, -1, 0, 1, 2, byte.min, byte.min, byte.min);
    __m64 B = _mm_setr_pi8(-1, 0, -1, 1, -2, -50, 0, 50);
    byte8 C = cast(byte8) _mm_sign_pi8(A, B);
    byte[8] correct = [ 2, 0, 0, 1, -2, byte.min, 0, byte.min];
    assert(C.array == correct);
}
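
// Usage sketch (illustrative, not itself an SSSE3 intrinsic): passing the same vector
// to both arguments of _mm_sign_epi16 yields the absolute value (with the usual
// -32768 wrap-around), matching _mm_abs_epi16.
unittest
{
    __m128i A = _mm_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000);
    short8 R = cast(short8) _mm_sign_epi16(A, A);
    short8 expected = cast(short8) _mm_abs_epi16(A);
    assert(R.array == expected.array);
}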