/**
* SSSE3 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSSE3
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.tmmintrin;

public import inteli.types;
import inteli.internals;

public import inteli.pmmintrin;
import inteli.mmx;

nothrow @nogc:


// SSSE3 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSSE3
// Note: this header will work whether you have SSSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+ssse3"] or equivalent to actively
// generate SSSE3 instructions.
// With GDC, use "dflags-gdc": ["-mssse3"] or equivalent to generate SSSE3 instructions.

/// Compute the absolute value of packed signed 16-bit integers in `a`.
/// Note: -32768 has no positive 16-bit counterpart and comes back unchanged.
__m128i _mm_abs_epi16 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSW, a);
    }
    else static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pabsw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s16(cast(short8)a);
    }
    else
    {
        // Portable path. LDC x86: this per-lane loop was noted to become pabsw (LDC 1.1, -O2).
        short8 lanes = cast(short8)a;
        foreach (i; 0 .. 8)
        {
            short v = lanes.array[i];
            // Negate in int, then narrow: -(-32768) wraps back to -32768.
            lanes.ptr[i] = (v < 0) ? cast(short)(-cast(int)(v)) : v;
        }
        return cast(__m128i)lanes;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000);
    short8 R = cast(short8) _mm_abs_epi16(A);
    short[8] expected = [0, 1, -32768, 32767, 10, 10, 1000, 1000];
    assert(R.array == expected);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m128i _mm_abs_epi32 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSD, cast(int4)a);
    }
    else static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pabsd128(cast(int4)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s32(cast(int4)a);
    }
    else
    {
        // Portable path. LDC x86: this per-lane loop was noted to become pabsd (LDC 1.1, -O2).
        int4 lanes = cast(int4)a;
        foreach (i; 0 .. 4)
        {
            int v = lanes.array[i];
            // int.min negates to itself (wrap-around), as the unittest expects.
            lanes.ptr[i] = (v < 0) ? -v : v;
        }
        return cast(__m128i)lanes;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647);
    int4 R = cast(int4) _mm_abs_epi32(A);
    int[4] expected = [0, 1, -2_147_483_648, 2_147_483_647];
    assert(R.array == expected);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
/// Note: -128 has no positive 8-bit counterpart and comes back unchanged.
__m128i _mm_abs_epi8 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSB, cast(byte16)a);
    }
    else static if (GDC_with_SSSE3)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pabsb128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s8(cast(byte16)a);
    }
    else version(LDC)
    {
        // LDC x86: generates pabsb since LDC 1.1 -O1
        //     arm64: generates abs since LDC 1.8 -O1
        // The IR computes n = 0 - x, then keeps x where 0 < x and n elsewhere.
        enum ir = `
                %n = sub <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %0
                %s = icmp slt <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %0
                %r = select <16 x i1> %s, <16 x i8> %0, <16 x i8> %n
                ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16)(cast(byte16)a);
    }
    else
    {
        // A loop version like in _mm_abs_epi16/_mm_abs_epi32 would be very slow
        // in LDC x86 and wouldn't vectorize. Doesn't generate pabsb in LDC though.
        // Viewed as unsigned bytes, the smaller of x and (0 - x) is |x|.
        __m128i negated = _mm_sub_epi8(_mm_setzero_si128(), a);
        return _mm_min_epu8(a, negated);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_abs_epi8(A);
    byte[16] expected = [0, 1, -128, 127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == expected);
}

/// Compute the absolute value of packed 64-bit floating-point elements in `a`.
/// #BONUS.
__m128d _mm_abs_pd (__m128d a) @trusted
{
    // Clearing bit 63 of each 64-bit lane drops the sign.
    long2 signCleared = 0x7fff_ffff_ffff_ffff;
    long2 bits = cast(long2)a;
    return cast(__m128d)(bits & signCleared);
}
unittest
{
    __m128d A = _mm_setr_pd(-42.0f, -double.infinity);
    __m128d R = _mm_abs_pd(A);
    double[2] expected = [42.0f, +double.infinity];
    assert(R.array == expected);
}

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m64 _mm_abs_pi16 (__m64 a) @trusted
{
    // Widen to 128 bits, reuse the SSE-width version, narrow back.
    __m128i wide = to_m128i(a);
    return to_m64(_mm_abs_epi16(wide));
}
unittest
{
    __m64 A = _mm_setr_pi16(0, -1, -32768, 32767);
    short4 R = cast(short4) _mm_abs_pi16(A);
    short[4] expected = [0, 1, -32768, 32767];
    assert(R.array == expected);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m64 _mm_abs_pi32 (__m64 a) @trusted
{
    // Widen to 128 bits, reuse the SSE-width version, narrow back.
    __m128i wide = to_m128i(a);
    return to_m64(_mm_abs_epi32(wide));
}
unittest
{
    __m64 A = _mm_setr_pi32(-1, -2_147_483_648);
    int2 R = cast(int2) _mm_abs_pi32(A);
    int[2] expected = [1, -2_147_483_648];
    assert(R.array == expected);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
__m64 _mm_abs_pi8 (__m64 a) @trusted
{
    // Widen to 128 bits, reuse the SSE-width version, narrow back.
    __m128i wide = to_m128i(a);
    return to_m64(_mm_abs_epi8(wide));
}
unittest
{
    __m64 A = _mm_setr_pi8(0, -1, -128, -127, 127, 0, 0, 0);
    byte8 R = cast(byte8) _mm_abs_pi8(A);
    byte[8] expected = [0, 1, -128, 127, 127, 0, 0, 0];
    assert(R.array == expected);
}

/// Compute the absolute value of packed 32-bit floating-point elements in `a`.
/// #BONUS.
__m128 _mm_abs_ps (__m128 a) @trusted
{
    // Clearing bit 31 of each 32-bit lane drops the sign.
    __m128i signCleared = 0x7fffffff;
    __m128i bits = cast(__m128i)a;
    return cast(__m128)(bits & signCleared);
}
unittest
{
    __m128 A = _mm_setr_ps(-0.0f, 10.0f, -42.0f, -float.infinity);
    __m128 R = _mm_abs_ps(A);
    float[4] expected = [0.0f, 10.0f, 42.0f, +float.infinity];
    assert(R.array == expected);
}

/// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the result right by `count` bytes, and return the low 16 bytes.
/// `b` supplies the low 16 bytes of the temporary and `a` the high 16 bytes; bytes shifted in from beyond it are zero.
__m128i _mm_alignr_epi8(ubyte count)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        // The builtin is given the shift as `count * 8`.
        return cast(__m128i)__builtin_ia32_palignr128(cast(long2)a, cast(long2)b, count * 8);
    }
    else version(LDC)
    {
        static if (count >= 32)
        {
            // Everything is shifted out.
            return _mm_setzero_si128();
        }
        else static if (count < 16)
        {
            // Generates palignr since LDC 1.1 -O1
            // Also generates a single ext instruction on arm64.
            // Indices 0..15 select from `b`, 16..31 from `a`.
            return cast(__m128i) shufflevectorLDC!(byte16,
                ( 0 + count), ( 1 + count), ( 2 + count), ( 3 + count),
                ( 4 + count), ( 5 + count), ( 6 + count), ( 7 + count),
                ( 8 + count), ( 9 + count), (10 + count), (11 + count),
                (12 + count), (13 + count), (14 + count), (15 + count))
                (cast(byte16)b, cast(byte16)a);
        }
        else
        {
            // 16 <= count < 32: indices 16..31 select from `a`; indices that wrap
            // past 31 land in 0..15 and select from the all-zero first operand.
            return cast(__m128i) shufflevectorLDC!(byte16,
                ( 0 + count) % 32, ( 1 + count) % 32, ( 2 + count) % 32, ( 3 + count) % 32,
                ( 4 + count) % 32, ( 5 + count) % 32, ( 6 + count) % 32, ( 7 + count) % 32,
                ( 8 + count) % 32, ( 9 + count) % 32, (10 + count) % 32, (11 + count) % 32,
                (12 + count) % 32, (13 + count) % 32, (14 + count) % 32, (15 + count) % 32)
                (cast(byte16)_mm_setzero_si128(), cast(byte16)a);
        }
    }
    else
    {
        byte16 hi = cast(byte16)a;
        byte16 lo = cast(byte16)b;
        byte16 r;

        foreach (i; 0 .. 16)
        {
            // Position of the source byte inside the 32-byte temporary.
            const int src = count + i;
            if (src < 16)
                r.ptr[i] = lo.array[src];
            else if (src < 32)
                r.ptr[i] = hi.array[src & 15];
            else
                r.ptr[i] = 0;
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);

    {
        byte16 C = cast(byte16)_mm_alignr_epi8!0(A ,B);
        byte[16] expected = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
        assert(C.array == expected);
    }
    {
        byte16 C = cast(byte16)_mm_alignr_epi8!20(A ,B);
        byte[16] expected = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0];
        assert(C.array == expected);
    }
    {
        byte16 C = cast(byte16)_mm_alignr_epi8!34(A ,B);
        byte[16] expected = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        assert(C.array == expected);
    }

    __m128i D = _mm_setr_epi8(-123, -82, 103, -69, 103, -26, 9, 106, 58, -11, 79, -91, 114, -13, 110, 60);
    __m128i E = _mm_setr_epi8(25, -51, -32, 91, -85, -39, -125, 31, -116, 104, 5, -101, 127, 82, 14, 81);
    byte16 F = cast(byte16)_mm_alignr_epi8!8(D, E);
    byte[16] expected = [-116, 104, 5, -101, 127, 82, 14, 81, -123, -82, 103, -69, 103, -26, 9, 106];
    assert(F.array == expected);
}

/// Concatenate 8-byte blocks in `a` and `b` into a 16-byte temporary result, shift the result right by `count` bytes, and return the low 8 bytes.
/// `b` supplies the low 8 bytes of the temporary and `a` the high 8 bytes; bytes shifted in from beyond it are zero.
__m64 _mm_alignr_pi8(ubyte count)(__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        // The builtin is given the shift as `count * 8`.
        return cast(__m64)__builtin_ia32_palignr(cast(long1)a, cast(long1)b, count * 8);
    }
    else version(LDC)
    {
        static if (count >= 16)
        {
            // Everything is shifted out.
            return _mm_setzero_si64();
        }
        else static if (count < 8)
        {
            // Note: in LDC x86 this uses a pshufb.
            // Generates ext in arm64.
            // Indices 0..7 select from `b`, 8..15 from `a`.
            return cast(__m64) shufflevectorLDC!(byte8,
                (0 + count), (1 + count), (2 + count), (3 + count),
                (4 + count), (5 + count), (6 + count), (7 + count))
                (cast(byte8)b, cast(byte8)a);
        }
        else
        {
            // 8 <= count < 16: indices 8..15 select from `a`; indices that wrap
            // past 15 land in 0..7 and select from the all-zero first operand.
            return cast(__m64) shufflevectorLDC!(byte8,
                (0 + count)%16, (1 + count)%16, (2 + count)%16, (3 + count)%16,
                (4 + count)%16, (5 + count)%16, (6 + count)%16, (7 + count)%16)
                (cast(byte8)_mm_setzero_si64(), cast(byte8)a);
        }
    }
    else
    {
        byte8 hi = cast(byte8)a;
        byte8 lo = cast(byte8)b;
        byte8 r;

        foreach (i; 0 .. 8)
        {
            // Position of the source byte inside the 16-byte temporary.
            const int src = count + i;
            if (src < 8)
                r.ptr[i] = lo.array[src];
            else if (src < 16)
                r.ptr[i] = hi.array[src & 7];
            else
                r.ptr[i] = 0;
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
    __m64 B = _mm_setr_pi8(17, 18, 19, 20, 21, 22, 23, 24);

    {
        byte8 C = cast(byte8)_mm_alignr_pi8!0(A ,B);
        byte[8] expected = [17, 18, 19, 20, 21, 22, 23, 24];
        assert(C.array == expected);
    }

    {
        byte8 C = cast(byte8)_mm_alignr_pi8!3(A ,B);
        byte[8] expected = [ 20, 21, 22, 23, 24, 1, 2, 3];
        assert(C.array == expected);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!11(A ,B);
        byte[8] expected = [4, 5, 6, 7, 8, 0, 0, 0];
        assert(C.array == expected);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!17(A ,B);
        byte[8] expected = [0, 0, 0, 0, 0, 0, 0, 0];
        assert(C.array == expected);
    }
}

/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
/// Sums wrap around on overflow; the low four results come from `a`, the high four from `b`.
__m128i _mm_hadd_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3 || LDC_with_SSSE3)
    {
        // Same builtin name and operand types for both compilers.
        return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i)vpaddq_s16(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 lhs = cast(short8)a;
        short8 rhs = cast(short8)b;
        short8 sums;
        foreach (i; 0 .. 4)
        {
            // The add happens in int; the cast truncates back to 16 bits.
            sums.ptr[i]     = cast(short)(lhs.array[2 * i] + lhs.array[2 * i + 1]);
            sums.ptr[i + 4] = cast(short)(rhs.array[2 * i] + rhs.array[2 * i + 1]);
        }
        return cast(__m128i)sums;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
    short8 C = cast(short8) _mm_hadd_epi16(A, A);
    short[8] expected = [ -1, 12, 48, 32767, -1, 12, 48, 32767];
    assert(C.array == expected);
}

/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
/// Sums wrap around on overflow; the low two results come from `a`, the high two from `b`.
__m128i _mm_hadd_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3 || LDC_with_SSSE3)
    {
        // Same builtin name and operand types for both compilers.
        return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i)vpaddq_s32(cast(int4)a, cast(int4)b);
    }
    else
    {
        int4 lhs = cast(int4)a;
        int4 rhs = cast(int4)b;
        int4 sums;
        foreach (i; 0 .. 2)
        {
            sums.ptr[i]     = lhs.array[2 * i] + lhs.array[2 * i + 1];
            sums.ptr[i + 2] = rhs.array[2 * i] + rhs.array[2 * i + 1];
        }
        return cast(__m128i)sums;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, -2, int.min, -1);
    __m128i B = _mm_setr_epi32(1, int.max, 4, -4);
    int4 C = cast(int4) _mm_hadd_epi32(A, B);
    int[4] expected = [ -1, int.max, int.min, 0 ];
    assert(C.array == expected);
}

/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
/// Sums wrap around on overflow; the low two results come from `a`, the high two from `b`.
__m64 _mm_hadd_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_phaddw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m64) vpadd_s16(cast(short4)a, cast(short4)b);
    }
    else
    {
        // LDC x86: generates phaddw since LDC 1.24 -O2.
        short4 lhs = cast(short4)a;
        short4 rhs = cast(short4)b;
        short4 sums;
        sums.ptr[0] = cast(short)(lhs.array[0] + lhs.array[1]);
        sums.ptr[1] = cast(short)(lhs.array[2] + lhs.array[3]);
        sums.ptr[2] = cast(short)(rhs.array[0] + rhs.array[1]);
        sums.ptr[3] = cast(short)(rhs.array[2] + rhs.array[3]);
        return cast(__m64)sums;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(1, -2, 4, 8);
    __m64 B = _mm_setr_pi16(16, 32, -1, -32768);
    short4 C = cast(short4) _mm_hadd_pi16(A, B);
    short[4] expected = [ -1, 12, 48, 32767 ];
    assert(C.array == expected);
}

/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`,
/// and pack the signed 32-bit results.
/// Sums wrap around on overflow; result lane 0 comes from `a`, lane 1 from `b`.
__m64 _mm_hadd_pi32 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_phaddd(cast(int2)a, cast(int2)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m64)vpadd_s32(cast(int2)a, cast(int2)b);
    }
    else
    {
        // LDC x86: generates phaddd since LDC 1.24 -O2
        int2 lhs = cast(int2)a;
        int2 rhs = cast(int2)b;
        int2 sums;
        sums.ptr[0] = lhs.array[0] + lhs.array[1];
        sums.ptr[1] = rhs.array[0] + rhs.array[1];
        return cast(__m64)sums;
    }
}
unittest
{
    __m64 A = _mm_setr_pi32(int.min, -1);
    __m64 B = _mm_setr_pi32(1, int.max);
    int2 C = cast(int2) _mm_hadd_pi32(A, B);
    int[2] expected = [ int.max, int.min ];
    assert(C.array == expected);
}

/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m128i _mm_hadds_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3 || LDC_with_SSSE3)
    {
        // Same builtin name and operand types for both compilers.
        return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqadd sequence
        short8 lhs = cast(short8)a;
        short8 rhs = cast(short8)b;
        // Gather the even-indexed and odd-indexed lanes, then add them with saturation.
        short8 evens = shufflevectorLDC!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(lhs, rhs);
        short8 odds  = shufflevectorLDC!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(lhs, rhs);
        return cast(__m128i)vqaddq_s16(evens, odds);
    }
    else
    {
        short8 lhs = cast(short8)a;
        short8 rhs = cast(short8)b;
        short8 sums;
        foreach (i; 0 .. 4)
        {
            // The add happens in int; the helper then clamps to the short range.
            sums.ptr[i]     = saturateSignedIntToSignedShort(lhs.array[2 * i] + lhs.array[2 * i + 1]);
            sums.ptr[i + 4] = saturateSignedIntToSignedShort(rhs.array[2 * i] + rhs.array[2 * i + 1]);
        }
        return cast(__m128i)sums;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
    short8 C = cast(short8) _mm_hadds_epi16(A, A);
    short[8] expected = [ -1, 12, 48, -32768, -1, 12, 48, -32768];
    assert(C.array == expected);
}

/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m64 _mm_hadds_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phaddsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // Note: LDC doesn't have __builtin_ia32_phaddsw
        // Put each 64-bit operand in the low half of a 128-bit vector and use the 128-bit builtin.
        long2 wideA;
        wideA.ptr[0] = a.array[0];
        long2 wideB;
        wideB.ptr[0] = b.array[0];
        int4 wideSum = cast(int4)__builtin_ia32_phaddsw128(cast(short8)wideA, cast(short8)wideB);
        // 32-bit lane 0 is copied as the sums for `a`, 32-bit lane 2 as the sums for `b`.
        int2 packed;
        packed.ptr[0] = wideSum.array[0];
        packed.ptr[1] = wideSum.array[2];
        return cast(__m64)packed;
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqadd sequence
        short4 lhs = cast(short4)a;
        short4 rhs = cast(short4)b;
        short4 evens = shufflevectorLDC!(short4, 0, 2, 4, 6)(lhs, rhs);
        short4 odds  = shufflevectorLDC!(short4, 1, 3, 5, 7)(lhs, rhs);
        return cast(__m64)vqadd_s16(evens, odds);
    }
    else
    {
        short4 lhs = cast(short4)a;
        short4 rhs = cast(short4)b;
        short4 sums;
        foreach (i; 0 .. 2)
        {
            // The add happens in int; the helper then clamps to the short range.
            sums.ptr[i]     = saturateSignedIntToSignedShort(lhs.array[2 * i] + lhs.array[2 * i + 1]);
            sums.ptr[i + 2] = saturateSignedIntToSignedShort(rhs.array[2 * i] + rhs.array[2 * i + 1]);
        }
        return cast(__m64)sums;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(-16, 32, -100, -32768);
    __m64 B = _mm_setr_pi16( 64, 32, 1, 32767);
    short4 C = cast(short4) _mm_hadds_pi16(A, B);
    short[4] expected = [ 16, -32768, 96, 32767];
    assert(C.array == expected);
}


/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m128i _mm_hsub_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3 || LDC_with_SSSE3)
    {
        // Same builtin name and operand types for both compilers.
        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produce uzp1 uzp2 sub sequence since LDC 1.8 -O1
        short8 lhs = cast(short8)a;
        short8 rhs = cast(short8)b;
        // Even-indexed lanes minus the odd-indexed lanes that follow them.
        short8 evens = shufflevectorLDC!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(lhs, rhs);
        short8 odds  = shufflevectorLDC!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(lhs, rhs);
        return cast(__m128i)(evens - odds);
    }
    else
    {
        short8 lhs = cast(short8)a;
        short8 rhs = cast(short8)b;
        short8 diffs;
        foreach (i; 0 .. 4)
        {
            // The subtraction happens in int; the cast truncates back to 16 bits (wraps).
            diffs.ptr[i]     = cast(short)(lhs.array[2 * i] - lhs.array[2 * i + 1]);
            diffs.ptr[i + 4] = cast(short)(rhs.array[2 * i] - rhs.array[2 * i + 1]);
        }
        return cast(__m128i)diffs;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(short.min, 1, 4, 8, 16, 32, 1, -32768);
    short8 C = cast(short8) _mm_hsub_epi16(A, A);
    short[8] expected = [ short.max, -4, -16, -32767, short.max, -4, -16, -32767];
    assert(C.array == expected);
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m128i _mm_hsub_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3 || LDC_with_SSSE3)
    {
        // Same builtin name and operand types for both compilers.
        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produce uzp1 uzp2 sub sequence since LDC 1.8 -O1
        int4 lhs = cast(int4)a;
        int4 rhs = cast(int4)b;
        // Even-indexed lanes minus the odd-indexed lanes that follow them.
        int4 evens = shufflevectorLDC!(int4, 0, 2, 4, 6)(lhs, rhs);
        int4 odds  = shufflevectorLDC!(int4, 1, 3, 5, 7)(lhs, rhs);
        return cast(__m128i)(evens - odds);
    }
    else
    {
        int4 lhs = cast(int4)a;
        int4 rhs = cast(int4)b;
        int4 diffs;
        foreach (i; 0 .. 2)
        {
            diffs.ptr[i]     = lhs.array[2 * i] - lhs.array[2 * i + 1];
            diffs.ptr[i + 2] = rhs.array[2 * i] - rhs.array[2 * i + 1];
        }
        return cast(__m128i)diffs;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, int.min, 1);
    __m128i B = _mm_setr_epi32(int.max, -1, 4, 4);
    int4 C = cast(int4) _mm_hsub_epi32(A, B);
    int[4] expected = [ -1, int.max, int.min, 0 ];
    assert(C.array == expected);
}

/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`,
/// and pack the signed 16-bit results.
__m64 _mm_hsub_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produce uzp1 uzp2 sub sequence since LDC 1.3 -O1
        short4 lhs = cast(short4)a;
        short4 rhs = cast(short4)b;
        // Even-indexed lanes minus the odd-indexed lanes that follow them.
        short4 evens = shufflevectorLDC!(short4, 0, 2, 4, 6)(lhs, rhs);
        short4 odds  = shufflevectorLDC!(short4, 1, 3, 5, 7)(lhs, rhs);
        return cast(__m64)(evens - odds);
    }
    else
    {
        // LDC x86: generates phsubw since LDC 1.24 -O2
        short4 lhs = cast(short4)a;
        short4 rhs = cast(short4)b;
        short4 diffs;
        // The subtraction happens in int; the cast truncates back to 16 bits (wraps).
        diffs.ptr[0] = cast(short)(lhs.array[0] - lhs.array[1]);
        diffs.ptr[1] = cast(short)(lhs.array[2] - lhs.array[3]);
        diffs.ptr[2] = cast(short)(rhs.array[0] - rhs.array[1]);
        diffs.ptr[3] = cast(short)(rhs.array[2] - rhs.array[3]);
        return cast(__m64)diffs;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(short.min, 1, 4, 8);
    __m64 B = _mm_setr_pi16(16, 32, 1, -32768);
    short4 C = cast(short4) _mm_hsub_pi16(A, B);
    short[4] expected = [ short.max, -4, -16, -32767];
    assert(C.array == expected);
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`,
/// and pack the signed 32-bit results.
__m64 _mm_hsub_pi32 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubd(cast(int2)a, cast(int2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: generates zip1+zip2+sub sequence since LDC 1.8 -O1
        int2 lhs = cast(int2)a;
        int2 rhs = cast(int2)b;
        // Lane 0 of each operand minus lane 1 of the same operand.
        int2 firsts  = shufflevectorLDC!(int2, 0, 2)(lhs, rhs);
        int2 seconds = shufflevectorLDC!(int2, 1, 3)(lhs, rhs);
        return cast(__m64)(firsts - seconds);
    }
    else
    {
        // LDC x86: generates phsubd since LDC 1.24 -O2
        int2 lhs = cast(int2)a;
        int2 rhs = cast(int2)b;
        int2 diffs;
        diffs.ptr[0] = lhs.array[0] - lhs.array[1];
        diffs.ptr[1] = rhs.array[0] - rhs.array[1];
        return cast(__m64)diffs;
    }
}
unittest
{
    __m64 A = _mm_setr_pi32(int.min, 1);
    __m64 B = _mm_setr_pi32(int.max, -1);
    int2 C = cast(int2) _mm_hsub_pi32(A, B);
    int[2] expected = [ int.max, int.min ];
    assert(C.array == expected);
}

/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3 || LDC_with_SSSE3)
    {
        // Same builtin name and operand types for both compilers.
        return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqsub sequence
        short8 lhs = cast(short8)a;
        short8 rhs = cast(short8)b;
        // Even-indexed lanes minus the odd-indexed lanes that follow them, with saturation.
        short8 evens = shufflevectorLDC!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(lhs, rhs);
        short8 odds  = shufflevectorLDC!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(lhs, rhs);
        return cast(__m128i)vqsubq_s16(evens, odds);
    }
    else
    {
        short8 lhs = cast(short8)a;
        short8 rhs = cast(short8)b;
        short8 diffs;
        foreach (i; 0 .. 4)
        {
            // The subtraction happens in int; the helper then clamps to the short range.
            diffs.ptr[i]     = saturateSignedIntToSignedShort(lhs.array[2 * i] - lhs.array[2 * i + 1]);
            diffs.ptr[i + 4] = saturateSignedIntToSignedShort(rhs.array[2 * i] - rhs.array[2 * i + 1]);
        }
        return cast(__m128i)diffs;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 32767, -1, -10, 32767);
    short8 C = cast(short8) _mm_hsubs_epi16(A, A);
    short[8] expected = [ 3, -4, 32767, -32768, 3, -4, 32767, -32768 ];
    assert(C.array == expected);
}


/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // Note: LDC doesn't have __builtin_ia32_phsubsw
        // Put each 64-bit operand in the low half of a 128-bit vector and use the 128-bit builtin.
        long2 wideA;
        wideA.ptr[0] = a.array[0];
        long2 wideB;
        wideB.ptr[0] = b.array[0];
        int4 wideDiff = cast(int4)__builtin_ia32_phsubsw128(cast(short8)wideA, cast(short8)wideB);
        // 32-bit lane 0 is copied as the differences for `a`, 32-bit lane 2 as those for `b`.
        int2 packed;
        packed.ptr[0] = wideDiff.array[0];
        packed.ptr[1] = wideDiff.array[2];
        return cast(__m64)packed;
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqsub sequence in -O1
        short4 lhs = cast(short4)a;
        short4 rhs = cast(short4)b;
        short4 evens = shufflevectorLDC!(short4, 0, 2, 4, 6)(lhs, rhs);
        short4 odds  = shufflevectorLDC!(short4, 1, 3, 5, 7)(lhs, rhs);
        return cast(__m64)vqsub_s16(evens, odds);
    }
    else
    {
        short4 lhs = cast(short4)a;
        short4 rhs = cast(short4)b;
        short4 diffs;
        foreach (i; 0 .. 2)
        {
            // The subtraction happens in int; the helper then clamps to the short range.
            diffs.ptr[i]     = saturateSignedIntToSignedShort(lhs.array[2 * i] - lhs.array[2 * i + 1]);
            diffs.ptr[i + 2] = saturateSignedIntToSignedShort(rhs.array[2 * i] - rhs.array[2 * i + 1]);
        }
        return cast(__m64)diffs;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(-16, 32, 100, -32768);
    __m64 B = _mm_setr_pi16( 64, 30, -9, 32767);
    short4 C = cast(short4) _mm_hsubs_pi16(A, B);
    short[4] expected = [ -48, 32767, 34, -32768];
    assert(C.array == expected);
}


/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding
/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers.
/// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
/// and pack the saturated results.
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        __m128i zero = _mm_setzero_si128();

        // `a` is unsigned: interleaving with zero bytes zero-extends it to 16-bit.
        __m128i ua_lo = _mm_unpacklo_epi8(a, zero);
        __m128i ua_hi = _mm_unpackhi_epi8(a, zero);

        // `b` is signed: widen the same way, then shift left and arithmetic-shift
        // right by 8 so the byte's sign bit is replicated into the upper half.
        __m128i sb_lo = _mm_srai_epi16(_mm_slli_epi16(_mm_unpacklo_epi8(b, zero), 8), 8);
        __m128i sb_hi = _mm_srai_epi16(_mm_slli_epi16(_mm_unpackhi_epi8(b, zero), 8), 8);

        // Multiply element-wise, no overflow can occur
        __m128i prod_lo = _mm_mullo_epi16(ua_lo, sb_lo);
        __m128i prod_hi = _mm_mullo_epi16(ua_hi, sb_hi);

        // Add pairwise with saturating horizontal add
        return _mm_hadds_epi16(prod_lo, prod_hi);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(  -1,  10, 100, -128,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // u8
    __m128i B = _mm_setr_epi8(-128, -30, 100,  127, -1, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0); // i8
    short8 C = cast(short8) _mm_maddubs_epi16(A, B);
    short[8] expected = [ -32768, 26256, 0, 0, 0, 0, 0, 0];
    assert(C.array == expected);
}

/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding
/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers.
/// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
/// and pack the saturated results.
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_pmaddubsw(cast(ubyte8)a, cast(ubyte8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // Widen both operands once and feed them to the 128-bit builtin.
        // (Previously A and B were declared but unused, and the conversions
        // were repeated inline.)
        __m128i A = to_m128i(a);
        __m128i B = to_m128i(b);
        return to_m64( cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16) A, cast(byte16) B));
    }
    else
    {
        // zero-extend a to 16-bit
        __m128i zero = _mm_setzero_si128();
        __m128i A = _mm_unpacklo_epi8(to_m128i(a), zero);

        // sign-extend b to 16-bit: widen with zeros, then shift left and
        // arithmetic-shift right by 8 to replicate the byte's sign bit.
        __m128i B = _mm_unpacklo_epi8(to_m128i(b), zero);
        B = _mm_srai_epi16( _mm_slli_epi16(B, 8), 8);

        // Multiply element-wise, no overflow can occur
        __m128i c = _mm_mullo_epi16(A, B);

        // Add pairwise with saturating horizontal add; the second operand is
        // zero, so only the low four results carry data.
        return to_m64( _mm_hadds_epi16(c, zero));
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(  -1,  10, 100, -128,  0, 0, 0, 0); // u8
    __m64 B = _mm_setr_pi8(-128, -30, 100,  127, -1, 2, 4, 6); // i8
    short4 C = cast(short4) _mm_maddubs_pi16(A, B);
    short[4] correct = [ -32768, 26256, 0, 0];
    assert(C.array == correct);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3 || LDC_with_SSSE3)
    {
        // Same builtin name and operand types for both compilers.
        return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Widening multiplies of the low and high halves.
        int4 prod_lo = vmull_s16(vget_low_s16(cast(short8)a),
                                 vget_low_s16(cast(short8)b));
        int4 prod_hi = vmull_s16(vget_high_s16(cast(short8)a),
                                 vget_high_s16(cast(short8)b));

        // Rounding narrowing shift right
        // narrow = (int16_t)((mul + 16384) >> 15);
        short4 half_lo = vrshrn_n_s32(prod_lo, 15);
        short4 half_hi = vrshrn_n_s32(prod_hi, 15);

        // Join together.
        return cast(__m128i) vcombine_s16(half_lo, half_hi);
    }
    else
    {
        short8 lhs = cast(short8)a;
        short8 rhs = cast(short8)b;
        short8 r;

        foreach (i; 0 .. 8)
        {
            // I doubted it at first, but an exhaustive search show this to be equivalent to Intel pseudocode.
            int product = lhs.array[i] * rhs.array[i];
            r.ptr[i] = cast(short) ( (product + 0x4000) >> 15);
        }

        return cast(__m128i)r;
    }
}

unittest
{
    __m128i A = _mm_setr_epi16(12345, -32768, 32767, 0, 1, 845, -6999, -1);
    __m128i B = _mm_setr_epi16(8877, -24487, 15678, 32760, 1, 0, -149, -1);
    short8 C = cast(short8) _mm_mulhrs_epi16(A, B);
    short[8] expected = [3344, 24487, 15678, 0, 0, 0, 32, 0];
    assert(C.array == expected);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        short4 lhs = cast(short4)a;
        short4 rhs = cast(short4)b;
        return cast(__m64) __builtin_ia32_pmulhrsw(lhs, rhs);
    }
    else static if (LDC_with_SSSE3)
    {
        // Go through the 128-bit builtin, then convert back with to_m64.
        short8 lhs = cast(short8) to_m128i(a);
        short8 rhs = cast(short8) to_m128i(b);
        __m128i wide = cast(__m128i) __builtin_ia32_pmulhrsw128(lhs, rhs);
        return cast(__m64) to_m64(wide);
    }
    else static if (LDC_with_ARM64)
    {
        int4 products = vmull_s16(cast(short4)a, cast(short4)b);

        // Rounding narrowing shift right
        // (int16_t)((mul + 16384) >> 15);
        return cast(__m64) vrshrn_n_s32(products, 15);
    }
    else
    {
        short4 lhs = cast(short4)a;
        short4 rhs = cast(short4)b;
        short4 result;

        foreach (lane; 0 .. 4)
        {
            // Same scalar formula as the 128-bit variant: add 0x4000, shift right by 15.
            int product = lhs.array[lane] * rhs.array[lane];
            result.ptr[lane] = cast(short) ( (product + 0x4000) >> 15);
        }
        return cast(__m64)result;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(12345, -32768, 32767,     0);
    __m64 B = _mm_setr_pi16( 8877, -24487, 15678, 32760);
    short4 C = cast(short4) _mm_mulhrs_pi16(A, B);
    short[4] correct = [3344, 24487, 15678, 0];
    assert(C.array == correct);
}


/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
__m128i _mm_shuffle_epi8 (__m128i a, __m128i b) @trusted
{
    // This is the lovely pshufb.
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pshufb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
    }
    else static if (LDC_with_ARM64)
    {
        // Keep only bit 7 and the low four bits of every control byte
        // before handing them to the table lookup.
        byte16 keepBits;
        keepBits = cast(byte)(0x8F);
        byte16 selectors = cast(byte16)b;
        selectors = selectors & keepBits;
        byte16 picked = vqtbl1q_s8(cast(byte16)a, selectors);
        return cast(__m128i)picked;
    }
    else
    {
        byte16 source = cast(byte16)a;
        byte16 control = cast(byte16)b;
        byte16 result;
        foreach (lane; 0 .. 16)
        {
            byte selector = control.array[lane];
            if (selector < 0)
                result.ptr[lane] = 0;                            // negative control byte => zero lane
            else
                result.ptr[lane] = source.array[ selector & 15 ]; // low 4 bits index into `a`
        }
        return cast(__m128i)result;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(15,   14,      13,  12, 11,  10, 9, 8, 7, 6,  5,  4,  3,  2,  1,  0);
    __m128i B = _mm_setr_epi8(15, -128, 13 + 16, -12, 11, -10, 9, 8, 7, 6, -5,  4,  3, -2,  1,  0);
    byte16 C = cast(byte16) _mm_shuffle_epi8(A, B);
    byte[16] correct =         [0,   0,       2,  0,  4,   0, 6, 7, 8, 9,  0, 11, 12,  0, 14, 15];
    assert(C.array == correct);
}

/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
__m64 _mm_shuffle_pi8 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        alias ubyte8 =__vector(ubyte[8]);
        return cast(__m64) __builtin_ia32_pshufb(cast(ubyte8) a, cast(ubyte8) b);
    }
    else static if (LDC_with_SSSE3)
    {
        // GDC does proper dance to avoid mmx registers, do it manually in LDC since __builtin_ia32_pshufb doesn't exist there
        __m128i table = to_m128i(a);
        __m128i selectors = to_m128i(b);
        // 0xF7 clears bit 3 of every selector byte; presumably this keeps the
        // 128-bit lookup inside the low 8 table bytes, like the 64-bit pshufb.
        selectors = selectors & _mm_set1_epi32(0xF7F7F7F7);
        __m128i shuffled = cast(__m128i) __builtin_ia32_pshufb128(cast(byte16)table, cast(byte16) selectors);
        return to_m64(shuffled);
    }
    else static if (LDC_with_ARM64)
    {
        // Keep only bit 7 and the low three bits of every control byte.
        byte8 keepBits;
        keepBits = cast(byte)(0x87);
        byte8 selectors = cast(byte8)b;
        selectors = selectors & keepBits;
        __m128i table = to_m128i(a);
        byte8 picked = vtbl1_s8(cast(byte16)table, cast(byte8)selectors);
        return cast(__m64)picked;
    }
    else
    {
        byte8 source = cast(byte8)a;
        byte8 control = cast(byte8)b;
        byte8 result;
        foreach (lane; 0 .. 8)
        {
            byte selector = control.array[lane];
            if (selector < 0)
                result.ptr[lane] = 0;                           // negative control byte => zero lane
            else
                result.ptr[lane] = source.array[ selector & 7 ]; // low 3 bits index into `a`
        }
        return cast(__m64)result;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(7,  6,  5,  4,      3,  2,  1,  0);
    __m64 B = _mm_setr_pi8(7,  6, -5,  4,  3 + 8, -2,  1,  0);
    byte8 C = cast(byte8) _mm_shuffle_pi8(A, B);
    byte[8] correct =    [0,  1,  0,  3,      4,  0,  6,  7];
    assert(C.array == correct);
}

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        short8 values = cast(short8)a;
        short8 signs  = cast(short8)b;
        return cast(__m128i) __builtin_ia32_psignw128(values, signs);
    }
    else static if (LDC_with_SSSE3)
    {
        short8 values = cast(short8)a;
        short8 signs  = cast(short8)b;
        return cast(__m128i) __builtin_ia32_psignw128(values, signs);
    }
    else
    {
        // LDC arm64: 5 instructions
        __m128i zero = _mm_setzero_si128();

        // All-ones in lanes where b is negative, zero elsewhere.
        __m128i negMask = _mm_srai_epi16(b, 15);

        // All-ones in lanes where b is exactly zero.
        __m128i isZero = _mm_cmpeq_epi16(b, zero);

        // (a + mask) ^ mask is the two's complement negation of a where mask is all-ones.
        __m128i negated = _mm_xor_si128(_mm_add_epi16(a, negMask), negMask);

        // Clear the lanes where b was zero.
        return _mm_andnot_si128(isZero, negated);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-2, -1, 0, 1,  2, short.min, short.min, short.min);
    __m128i B = _mm_setr_epi16(-1,  0,-1, 1, -2,       -50,         0,        50);
    short8 C = cast(short8) _mm_sign_epi16(A, B);
    short[8] correct =        [ 2,  0, 0, 1, -2, short.min,         0, short.min];
    assert(C.array == correct);
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        // psignd works on 32-bit lanes, so pass the operands as int4
        // rather than relying on an implicit reinterpretation from short8.
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // Same as above: 32-bit lane operands for the 32-bit lane builtin.
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
    }
    else
    {
        // All-ones in lanes where b is negative, zero elsewhere.
        __m128i mask = _mm_srai_epi32(b, 31);

        // All-ones in lanes where b is exactly zero.
        __m128i zeromask = _mm_cmpeq_epi32(b, _mm_setzero_si128());

        // (a + mask) ^ mask negates a where mask is all-ones; then clear the
        // lanes where b was zero.
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi32(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-2, -1,  0, int.max);
    __m128i B = _mm_setr_epi32(-1,  0, -1, 1);
    int4 C = cast(int4) _mm_sign_epi32(A, B);
    int[4] correct =          [ 2,  0, 0, int.max];
    assert(C.array == correct);
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        __m128i zero = _mm_setzero_si128();

        // extend sign bit: all-ones in lanes where b < 0
        __m128i negMask = _mm_cmplt_epi8(b, zero);

        // All-ones in lanes where b is exactly zero.
        __m128i isZero = _mm_cmpeq_epi8(b, zero);

        // (a + mask) ^ mask is the two's complement negation of a where mask is all-ones.
        __m128i negated = _mm_xor_si128(_mm_add_epi8(a, negMask), negMask);

        // Clear the lanes where b was zero.
        return _mm_andnot_si128(isZero, negated);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-2, -1, 0, 1,  2, byte.min, byte.min, byte.min, -1,  0,-1, 1, -2,      -50,        0,       50);
    __m128i B = _mm_setr_epi8(-1,  0,-1, 1, -2,      -50,        0,       50, -2, -1, 0, 1,  2, byte.min, byte.min, byte.min);
    byte16  C = cast(byte16) _mm_sign_epi8(A, B);
    byte[16] correct =       [ 2,  0, 0, 1, -2, byte.min,        0, byte.min,  1,  0, 0, 1, -2,       50,        0,      -50];
    assert(C.array == correct);
}

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi16 (__m64 a, __m64 b) @trusted
{
    // Delegate to the 128-bit variant and convert the result back with to_m64.
    __m128i wideA = to_m128i(a);
    __m128i wideB = to_m128i(b);
    return to_m64( _mm_sign_epi16(wideA, wideB) );
}
unittest
{
    __m64 A = _mm_setr_pi16( 2, short.min, short.min, short.min);
    __m64 B = _mm_setr_pi16(-2,       -50,         0,        50);
    short4 C = cast(short4) _mm_sign_pi16(A, B);
    short[4] correct =     [-2, short.min,         0, short.min];
    assert(C.array == correct);
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi32 (__m64 a, __m64 b) @trusted
{
    // Delegate to the 128-bit variant and convert the result back with to_m64.
    __m128i wideA = to_m128i(a);
    __m128i wideB = to_m128i(b);
    return to_m64( _mm_sign_epi32(wideA, wideB) );
}
unittest
{
    __m64 A = _mm_setr_pi32(-2, -100);
    __m64 B = _mm_setr_pi32(-1,    0);
    int2 C = cast(int2) _mm_sign_pi32(A, B);
    int[2] correct =       [ 2,    0];
    assert(C.array == correct);
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi8 (__m64 a, __m64 b) @trusted
{
    // Delegate to the 128-bit variant and convert the result back with to_m64.
    __m128i wideA = to_m128i(a);
    __m128i wideB = to_m128i(b);
    return to_m64( _mm_sign_epi8(wideA, wideB) );
}
unittest
{
    __m64 A = _mm_setr_pi8(-2, -1, 0, 1,  2, byte.min, byte.min, byte.min);
    __m64 B = _mm_setr_pi8(-1,  0,-1, 1, -2,      -50,        0,       50);
    byte8 C = cast(byte8) _mm_sign_pi8(A, B);
    byte[8] correct =     [ 2,  0, 0, 1, -2, byte.min,        0, byte.min];
    assert(C.array == correct);
}