/**
* SSSE3 intrinsics.
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.tmmintrin;

public import inteli.types;
import inteli.internals;

public import inteli.pmmintrin;
import inteli.mmx;

nothrow @nogc:


// SSSE3 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSSE3
// Note: this header will work whether you have SSSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+ssse3"] or equivalent to actively
// generate SSSE3 instructions.

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m128i _mm_abs_epi16 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSW, a);
    }
    else static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pabsw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s16(cast(short8)a);
    }
    else
    {
        // LDC x86: generates pabsw since LDC 1.1 -O2
        short8 sa = cast(short8)a;
        for (int i = 0; i < 8; ++i)
        {
            short s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s));
        }
        return cast(__m128i)sa;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000);
    short8 B = cast(short8) _mm_abs_epi16(A);
    short[8] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m128i _mm_abs_epi32 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSD, cast(int4)a);
    }
    else static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pabsd128(cast(int4)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s32(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pabsd since LDC 1.1 -O2
        int4 sa = cast(int4)a;
        for (int i = 0; i < 4; ++i)
        {
            int s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : -s;
        }
        return cast(__m128i)sa;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647);
    int4 B = cast(int4) _mm_abs_epi32(A);
    int[4] correct = [0, 1, -2_147_483_648, 2_147_483_647];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
__m128i _mm_abs_epi8 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSB, cast(byte16)a);
    }
    else static if (GDC_with_SSSE3)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pabsb128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s8(cast(byte16)a);
    }
    else static if (LDC_with_SSSE3)
    {
        return __asm!__m128i("pabsb $1,$0","=x,x",a);
    }
    else
    {
        // A loop version like in _mm_abs_epi16/_mm_abs_epi32 would be very slow
        // in LDC x86 and wouldn't vectorize. This form doesn't generate pabsb in LDC either, though.
        return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 B = cast(byte16) _mm_abs_epi8(A);
    byte[16] correct = [0, 1, -128, 127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m64 _mm_abs_pi16 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi16(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi16(0, -1, -32768, 32767);
    short4 B = cast(short4) _mm_abs_pi16(A);
    short[4] correct = [0, 1, -32768, 32767];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m64 _mm_abs_pi32 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi32(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi32(-1, -2_147_483_648);
    int2 B = cast(int2) _mm_abs_pi32(A);
    int[2] correct = [1, -2_147_483_648];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
__m64 _mm_abs_pi8 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi8(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi8(0, -1, -128, -127, 127, 0, 0, 0);
    byte8 B = cast(byte8) _mm_abs_pi8(A);
    byte[8] correct = [0, 1, -128, 127, 127, 0, 0, 0];
    assert(B.array == correct);
}
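
// Illustrative usage sketch, not an Intel intrinsic: absolute value over a short array,
// assuming the _mm_loadu_si128/_mm_storeu_si128 intrinsics reachable through the public
// pmmintrin/emmintrin import chain above.
unittest
{
    short[8] input  = [-4, 3, -2, 1, 0, -32767, 7, -8];
    short[8] output;
    __m128i v = _mm_loadu_si128(cast(const(__m128i)*) input.ptr);
    _mm_storeu_si128(cast(__m128i*) output.ptr, _mm_abs_epi16(v));
    short[8] expected = [4, 3, 2, 1, 0, 32767, 7, 8];
    assert(output == expected);
}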

/// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result (with `a` in the
/// low 16 bytes), shift the result right by `count` bytes (modulo 32), and return the low 16 bytes.
__m128i _mm_alignr_epi8(ubyte count)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        // Operands swapped so that the byte order matches the portable path below,
        // which places `a` in the low half of the temporary.
        return cast(__m128i)__builtin_ia32_palignr128(cast(long2)b, cast(long2)a, count * 8);
    }
    else
    {
        // Generates palignr since LDC 1.1 -O1
        // Also generates a single ext instruction on arm64.
        return cast(__m128i) shufflevector!(byte16, ( 0 + count) % 32,
                                                    ( 1 + count) % 32,
                                                    ( 2 + count) % 32,
                                                    ( 3 + count) % 32,
                                                    ( 4 + count) % 32,
                                                    ( 5 + count) % 32,
                                                    ( 6 + count) % 32,
                                                    ( 7 + count) % 32,
                                                    ( 8 + count) % 32,
                                                    ( 9 + count) % 32,
                                                    (10 + count) % 32,
                                                    (11 + count) % 32,
                                                    (12 + count) % 32,
                                                    (13 + count) % 32,
                                                    (14 + count) % 32,
                                                    (15 + count) % 32)(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);

    {
        byte16 C = cast(byte16)_mm_alignr_epi8!7(A, B);
        byte[16] correct = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23];
        assert(C.array == correct);
    }
    {
        byte16 C = cast(byte16)_mm_alignr_epi8!20(A, B);
        byte[16] correct = [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4];
        assert(C.array == correct);
    }
}

/// Concatenate 8-byte blocks in `a` and `b` into a 16-byte temporary result (with `a` in the
/// low 8 bytes), shift the result right by `count` bytes (modulo 16), and return the low 8 bytes.
__m64 _mm_alignr_pi8(ubyte count)(__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        // Operands swapped for the same reason as in _mm_alignr_epi8.
        return cast(__m64)__builtin_ia32_palignr(cast(long)b, cast(long)a, count * 8);
    }
    else
    {
        // Note: in LDC x86 this uses a pshufb.
        // Generates ext in arm64.
        return cast(__m64) shufflevector!(byte8, (0 + count) % 16,
                                                 (1 + count) % 16,
                                                 (2 + count) % 16,
                                                 (3 + count) % 16,
                                                 (4 + count) % 16,
                                                 (5 + count) % 16,
                                                 (6 + count) % 16,
                                                 (7 + count) % 16)(cast(byte8)a, cast(byte8)b);
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
    __m64 B = _mm_setr_pi8(17, 18, 19, 20, 21, 22, 23, 24);

    {
        byte8 C = cast(byte8)_mm_alignr_pi8!3(A, B);
        byte[8] correct = [4, 5, 6, 7, 8, 17, 18, 19];
        assert(C.array == correct);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!10(A, B);
        byte[8] correct = [19, 20, 21, 22, 23, 24, 1, 2];
        assert(C.array == correct);
    }
}

/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m128i _mm_hadd_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i)vpaddq_s16(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
        r.ptr[2] = cast(short)(sa.array[4] + sa.array[5]);
        r.ptr[3] = cast(short)(sa.array[6] + sa.array[7]);
        r.ptr[4] = cast(short)(sb.array[0] + sb.array[1]);
        r.ptr[5] = cast(short)(sb.array[2] + sb.array[3]);
        r.ptr[6] = cast(short)(sb.array[4] + sb.array[5]);
        r.ptr[7] = cast(short)(sb.array[6] + sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
    short8 C = cast(short8) _mm_hadd_epi16(A, A);
    short[8] correct = [ -1, 12, 48, 32767, -1, 12, 48, 32767];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m128i _mm_hadd_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i)vpaddq_s32(cast(int4)a, cast(int4)b);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] + ia.array[1];
        r.ptr[1] = ia.array[2] + ia.array[3];
        r.ptr[2] = ib.array[0] + ib.array[1];
        r.ptr[3] = ib.array[2] + ib.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, -2, int.min, -1);
    __m128i B = _mm_setr_epi32(1, int.max, 4, -4);
    int4 C = cast(int4) _mm_hadd_epi32(A, B);
    int[4] correct = [ -1, int.max, int.min, 0 ];
    assert(C.array == correct);
}
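
// Illustrative usage sketch, not an Intel intrinsic: reducing eight 16-bit lanes to their sum
// by chaining _mm_hadd_epi16. Chained phaddw is usually not the fastest reduction on x86, so
// treat this as a readability example rather than an optimized pattern.
unittest
{
    __m128i v = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
    v = _mm_hadd_epi16(v, v); //  3,  7, 11, 15,  3,  7, 11, 15
    v = _mm_hadd_epi16(v, v); // 10, 26, 10, 26, 10, 26, 10, 26
    v = _mm_hadd_epi16(v, v); // 36 in every lane
    assert((cast(short8)v).array[0] == 36);
}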

/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m64 _mm_hadd_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_phaddw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m64) vpadd_s16(cast(short4)a, cast(short4)b);
    }
    else
    {
        // LDC x86: generates phaddw since LDC 1.24 -O2.
        short4 r;
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
        r.ptr[2] = cast(short)(sb.array[0] + sb.array[1]);
        r.ptr[3] = cast(short)(sb.array[2] + sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(1, -2, 4, 8);
    __m64 B = _mm_setr_pi16(16, 32, -1, -32768);
    short4 C = cast(short4) _mm_hadd_pi16(A, B);
    short[4] correct = [ -1, 12, 48, 32767 ];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m64 _mm_hadd_pi32 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_phaddd(cast(int2)a, cast(int2)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m64) vpadd_s32(cast(int2)a, cast(int2)b);
    }
    else
    {
        // LDC x86: generates phaddd since LDC 1.24 -O2
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 r;
        r.ptr[0] = ia.array[0] + ia.array[1];
        r.ptr[1] = ib.array[0] + ib.array[1];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi32(int.min, -1);
    __m64 B = _mm_setr_pi32(1, int.max);
    int2 C = cast(int2) _mm_hadd_pi32(A, B);
    int[2] correct = [ int.max, int.min ];
    assert(C.array == correct);
}

/*
__m128i _mm_hadds_epi16 (__m128i a, __m128i b)
{
}
unittest
{
}
*/
/*
__m64 _mm_hadds_pi16 (__m64 a, __m64 b)
{
}
unittest
{
}
*/
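
// The saturating horizontal adds above are still to be implemented. Below is a minimal portable
// sketch, disabled with version(none) because it is illustrative only; an optimized version could
// use the __builtin_ia32_phaddsw128 builtin listed in the note at the end of this file.
version(none)
{
    /// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
    /// and pack the signed 16-bit results. (_mm_hadds_pi16 could be built on top of this with
    /// to_m128i/to_m64, as the _mm_abs_pi* wrappers above do.)
    __m128i _mm_hadds_epi16 (__m128i a, __m128i b) @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        foreach (i; 0 .. 4)
        {
            int sumA = sa.array[2*i] + sa.array[2*i + 1];
            int sumB = sb.array[2*i] + sb.array[2*i + 1];
            if (sumA > 32767) sumA = 32767;
            if (sumA < -32768) sumA = -32768;
            if (sumB > 32767) sumB = 32767;
            if (sumB < -32768) sumB = -32768;
            r.ptr[i]     = cast(short)sumA;
            r.ptr[i + 4] = cast(short)sumB;
        }
        return cast(__m128i)r;
    }
}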

/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m128i _mm_hsub_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produces an uzp1 + uzp2 + sub sequence since LDC 1.8 -O1
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)(c - d);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
        r.ptr[2] = cast(short)(sa.array[4] - sa.array[5]);
        r.ptr[3] = cast(short)(sa.array[6] - sa.array[7]);
        r.ptr[4] = cast(short)(sb.array[0] - sb.array[1]);
        r.ptr[5] = cast(short)(sb.array[2] - sb.array[3]);
        r.ptr[6] = cast(short)(sb.array[4] - sb.array[5]);
        r.ptr[7] = cast(short)(sb.array[6] - sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(short.min, 1, 4, 8, 16, 32, 1, -32768);
    short8 C = cast(short8) _mm_hsub_epi16(A, A);
    short[8] correct = [ short.max, -4, -16, -32767, short.max, -4, -16, -32767];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m128i _mm_hsub_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produces an uzp1 + uzp2 + sub sequence since LDC 1.8 -O1
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 c = shufflevector!(int4, 0, 2, 4, 6)(ia, ib);
        int4 d = shufflevector!(int4, 1, 3, 5, 7)(ia, ib);
        return cast(__m128i)(c - d);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] - ia.array[1];
        r.ptr[1] = ia.array[2] - ia.array[3];
        r.ptr[2] = ib.array[0] - ib.array[1];
        r.ptr[3] = ib.array[2] - ib.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, int.min, 1);
    __m128i B = _mm_setr_epi32(int.max, -1, 4, 4);
    int4 C = cast(int4) _mm_hsub_epi32(A, B);
    int[4] correct = [ -1, int.max, int.min, 0 ];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m64 _mm_hsub_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produces an uzp1 + uzp2 + sub sequence since LDC 1.3 -O1
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)(c - d);
    }
    else
    {
        // LDC x86: generates phsubw since LDC 1.24 -O2
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
        r.ptr[2] = cast(short)(sb.array[0] - sb.array[1]);
        r.ptr[3] = cast(short)(sb.array[2] - sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(short.min, 1, 4, 8);
    __m64 B = _mm_setr_pi16(16, 32, 1, -32768);
    short4 C = cast(short4) _mm_hsub_pi16(A, B);
    short[4] correct = [ short.max, -4, -16, -32767];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m64 _mm_hsub_pi32 (__m64 a, __m64 b)
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubd(cast(int2)a, cast(int2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: generates zip1+zip2+sub sequence since LDC 1.8 -O1
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 c = shufflevector!(int2, 0, 2)(ia, ib);
        int2 d = shufflevector!(int2, 1, 3)(ia, ib);
        return cast(__m64)(c - d);
    }
    else
    {
        // LDC x86: generates phsubd since LDC 1.24 -O2
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 r;
        r.ptr[0] = ia.array[0] - ia.array[1];
        r.ptr[1] = ib.array[0] - ib.array[1];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi32(int.min, 1);
    __m64 B = _mm_setr_pi32(int.max, -1);
    int2 C = cast(int2) _mm_hsub_pi32(A, B);
    int[2] correct = [ int.max, int.min ];
    assert(C.array == correct);
}

/*
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b)
{
}
unittest
{
}
*/
/*
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b)
{
}
unittest
{
}
*/


/*
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b)
{
}
unittest
{
}
*/
/*
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b)
{
}
unittest
{
}
*/
/*
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b)
{
}
unittest
{
}
*/
/*
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b)
{
}
unittest
{
}
*/

/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
__m128i _mm_shuffle_epi8 (__m128i a, __m128i b) @trusted
{
    // This is the lovely pshufb.
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
    }
    else static if (LDC_with_ARM64)
    {
        byte16 bb = cast(byte16)b;
        byte16 mask;
        mask = cast(byte)(0x8F);
        bb = bb & mask;
        byte16 r = vqtbl1q_s8(cast(byte16)a, bb);
        return cast(__m128i)r;
    }
    else
    {
        byte16 r;
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        for (int i = 0; i < 16; ++i)
        {
            byte s = bb.array[i];
            r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 15 ];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m128i B = _mm_setr_epi8(15, -128, 13 + 16, -12, 11, -10, 9, 8, 7, 6, -5, 4, 3, -2, 1, 0);
    byte16 C = cast(byte16) _mm_shuffle_epi8(A, B);
    byte[16] correct = [0, 0, 2, 0, 4, 0, 6, 7, 8, 9, 0, 11, 12, 0, 14, 15];
    assert(C.array == correct);
}
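
// Illustrative usage sketch, not an Intel intrinsic: pshufb is commonly used with a constant
// control mask, for example to reverse the byte order of a vector.
unittest
{
    __m128i v   = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    __m128i rev = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    byte16 r = cast(byte16) _mm_shuffle_epi8(v, rev);
    byte[16] correct = [16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1];
    assert(r.array == correct);
}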

/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
__m64 _mm_shuffle_pi8 (__m64 a, __m64 b)
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        alias ubyte8 = __vector(ubyte[8]);
        return cast(__m64) __builtin_ia32_pshufb(cast(ubyte8) a, cast(ubyte8) b);
    }
    else static if (LDC_with_SSSE3)
    {
        // GDC's builtin does the proper dance to avoid MMX registers; do it manually here,
        // since __builtin_ia32_pshufb doesn't exist in LDC.
        __m128i A = to_m128i(a);
        __m128i index = to_m128i(b);
        index = index & _mm_set1_epi32(0xF7F7F7F7); // clear bit 3 so indices stay in 0..7, keeping bit 7 for zeroing
        return to_m64( cast(__m128i) __builtin_ia32_pshufb128(cast(byte16)A, cast(byte16) index) );
    }
    else static if (LDC_with_ARM64)
    {
        byte8 bb = cast(byte8)b;
        byte8 mask;
        mask = cast(byte)(0x87);
        bb = bb & mask;
        __m128i l = to_m128i(a);
        byte8 r = vtbl1_s8(cast(byte16)l, cast(byte8)bb);
        return cast(__m64)r;
    }
    else
    {
        byte8 r;
        byte8 ba = cast(byte8)a;
        byte8 bb = cast(byte8)b;
        for (int i = 0; i < 8; ++i)
        {
            byte s = bb.array[i];
            r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 7 ];
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    __m64 B = _mm_setr_pi8(7, 6, -5, 4, 3 + 8, -2, 1, 0);
    byte8 C = cast(byte8) _mm_shuffle_pi8(A, B);
    byte[8] correct = [0, 1, 0, 3, 4, 0, 6, 7];
    assert(C.array == correct);
}

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi16 (__m128i a, __m128i b)
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // LDC arm64: 5 instructions
        __m128i mask = _mm_srai_epi16(b, 15);
        __m128i zeromask = _mm_cmpeq_epi16(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi16(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-2, -1, 0, 1, 2, short.min, short.min, short.min);
    __m128i B = _mm_setr_epi16(-1, 0,-1, 1, -2, -50, 0, 50);
    short8 C = cast(short8) _mm_sign_epi16(A, B);
    short[8] correct = [ 2, 0, 0, 1, -2, short.min, 0, short.min];
    assert(C.array == correct);
}
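
// Illustrative usage sketch, not an Intel intrinsic: with `b` equal to `a`, _mm_sign_epi16
// computes an absolute value (zero lanes stay zero, and -32768 stays -32768, as with _mm_abs_epi16).
unittest
{
    __m128i A = _mm_setr_epi16(-4, 0, 7, -32768, 3, -3, 100, -100);
    short8 R = cast(short8) _mm_sign_epi16(A, A);
    short[8] correct = [4, 0, 7, -32768, 3, 3, 100, 100];
    assert(R.array == correct);
}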

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi32 (__m128i a, __m128i b)
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
    }
    else
    {
        __m128i mask = _mm_srai_epi32(b, 31);
        __m128i zeromask = _mm_cmpeq_epi32(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi32(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-2, -1, 0, int.max);
    __m128i B = _mm_setr_epi32(-1, 0, -1, 1);
    int4 C = cast(int4) _mm_sign_epi32(A, B);
    int[4] correct = [ 2, 0, 0, int.max];
    assert(C.array == correct);
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi8 (__m128i a, __m128i b)
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        __m128i mask = _mm_cmplt_epi8(b, _mm_setzero_si128()); // extend sign bit
        __m128i zeromask = _mm_cmpeq_epi8(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi8(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-2, -1, 0, 1, 2, byte.min, byte.min, byte.min, -1, 0,-1, 1, -2, -50, 0, 50);
    __m128i B = _mm_setr_epi8(-1, 0,-1, 1, -2, -50, 0, 50, -2, -1, 0, 1, 2, byte.min, byte.min, byte.min);
    byte16 C = cast(byte16) _mm_sign_epi8(A, B);
    byte[16] correct = [ 2, 0, 0, 1, -2, byte.min, 0, byte.min, 1, 0, 0, 1, -2, 50, 0, -50];
    assert(C.array == correct);
}

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi16 (__m64 a, __m64 b)
{
    return to_m64( _mm_sign_epi16( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi16( 2, short.min, short.min, short.min);
    __m64 B = _mm_setr_pi16(-2, -50, 0, 50);
    short4 C = cast(short4) _mm_sign_pi16(A, B);
    short[4] correct = [-2, short.min, 0, short.min];
    assert(C.array == correct);
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi32 (__m64 a, __m64 b)
{
    return to_m64( _mm_sign_epi32( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi32(-2, -100);
    __m64 B = _mm_setr_pi32(-1, 0);
    int2 C = cast(int2) _mm_sign_pi32(A, B);
    int[2] correct = [ 2, 0];
    assert(C.array == correct);
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi8 (__m64 a, __m64 b)
{
    return to_m64( _mm_sign_epi8( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi8(-2, -1, 0, 1, 2, byte.min, byte.min, byte.min);
    __m64 B = _mm_setr_pi8(-1, 0,-1, 1, -2, -50, 0, 50);
    byte8 C = cast(byte8) _mm_sign_pi8(A, B);
    byte[8] correct = [ 2, 0, 0, 1, -2, byte.min, 0, byte.min];
    assert(C.array == correct);
}



/*

Note: LDC 1.0 to 1.27 have the following builtins:

pragma(LDC_intrinsic, "llvm.x86.ssse3.phadd.sw.128")
short8 __builtin_ia32_phaddsw128(short8, short8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.ssse3.phsub.sw.128")
short8 __builtin_ia32_phsubsw128(short8, short8) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.ssse3.pmadd.ub.sw.128")
short8 __builtin_ia32_pmaddubsw128(byte16, byte16) pure @safe;

pragma(LDC_intrinsic, "llvm.x86.ssse3.pmul.hr.sw.128")
short8 __builtin_ia32_pmulhrsw128(short8, short8) pure @safe;

*/
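
// The remaining commented-out intrinsics (_mm_maddubs_epi16, _mm_mulhrs_epi16 and their __m64
// counterparts) could be implemented with the builtins listed in the note above when available.
// Below is a portable scalar sketch of the two __m128i variants, disabled with version(none)
// because it is illustrative only and takes none of the per-compiler fast paths used elsewhere
// in this module.
version(none)
{
    /// Vertically multiply each unsigned 8-bit integer in `a` with the corresponding signed 8-bit
    /// integer in `b`, then horizontally add adjacent pairs of intermediate signed 16-bit integers,
    /// with saturation.
    __m128i _mm_maddubs_epi16 (__m128i a, __m128i b) @trusted
    {
        byte16 ba = cast(byte16)a; // lanes of `a` are reinterpreted as unsigned below
        byte16 bb = cast(byte16)b;
        short8 r;
        foreach (i; 0 .. 8)
        {
            int p = cast(ubyte)(ba.array[2*i    ]) * bb.array[2*i    ]
                  + cast(ubyte)(ba.array[2*i + 1]) * bb.array[2*i + 1];
            if (p > 32767) p = 32767;
            if (p < -32768) p = -32768;
            r.ptr[i] = cast(short)p;
        }
        return cast(__m128i)r;
    }

    /// Multiply packed signed 16-bit integers in `a` and `b`, producing 32-bit intermediates;
    /// keep the 18 most significant bits, round by adding 1, and return bits [16:1].
    __m128i _mm_mulhrs_epi16 (__m128i a, __m128i b) @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        foreach (i; 0 .. 8)
        {
            int tmp = ((cast(int)sa.array[i] * cast(int)sb.array[i]) >> 14) + 1;
            r.ptr[i] = cast(short)(tmp >> 1);
        }
        return cast(__m128i)r;
    }
}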