/**
* SSSE3 intrinsics.
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.tmmintrin;

public import inteli.types;
import inteli.internals;

public import inteli.pmmintrin;
import inteli.mmx;

nothrow @nogc:


// SSSE3 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSSE3
// Note: this header will work whether you have SSSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+ssse3"] or equivalent to actively
// generate SSSE3 instructions.

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m128i _mm_abs_epi16 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSW, a);
    }
    else static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pabsw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s16(cast(short8)a);
    }
    else
    {
        // LDC x86: generates pabsw since LDC 1.1 -O2
        short8 sa = cast(short8)a;
        for (int i = 0; i < 8; ++i)
        {
            short s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s));
        }
        return cast(__m128i)sa;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000);
    short8 B = cast(short8) _mm_abs_epi16(A);
    short[8] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m128i _mm_abs_epi32 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSD, cast(int4)a);
    }
    else static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pabsd128(cast(int4)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s32(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pabsd since LDC 1.1 -O2
        int4 sa = cast(int4)a;
        for (int i = 0; i < 4; ++i)
        {
            int s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : -s;
        }
        return cast(__m128i)sa;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647);
    int4 B = cast(int4) _mm_abs_epi32(A);
    int[4] correct = [0, 1, -2_147_483_648, 2_147_483_647];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
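/// Note: like the `pabsb` instruction, this wraps on the most negative value:
/// the absolute value of -128 stays -128 (see the unittest below).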
__m128i _mm_abs_epi8 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSB, cast(byte16)a);
    }
    else static if (GDC_with_SSSE3)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pabsb128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s8(cast(byte16)a);
    }
    else version(LDC)
    {
        // LDC x86: generates pabsb since LDC 1.1 -O1
        // arm64: generates abs since LDC 1.8 -O1
        enum ir = `
            %n = sub <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %0
            %s = icmp slt <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %0
            %r = select <16 x i1> %s, <16 x i8> %0, <16 x i8> %n
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16)(cast(byte16)a);
    }
    else
    {
        // A loop version like in _mm_abs_epi16/_mm_abs_epi32 would be very slow
        // in LDC x86 and wouldn't vectorize. Doesn't generate pabsb in LDC though.
        return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 B = cast(byte16) _mm_abs_epi8(A);
    byte[16] correct = [0, 1, -128, 127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m64 _mm_abs_pi16 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi16(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi16(0, -1, -32768, 32767);
    short4 B = cast(short4) _mm_abs_pi16(A);
    short[4] correct = [0, 1, -32768, 32767];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m64 _mm_abs_pi32 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi32(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi32(-1, -2_147_483_648);
    int2 B = cast(int2) _mm_abs_pi32(A);
    int[2] correct = [1, -2_147_483_648];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
__m64 _mm_abs_pi8 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi8(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi8(0, -1, -128, -127, 127, 0, 0, 0);
    byte8 B = cast(byte8) _mm_abs_pi8(A);
    byte[8] correct = [0, 1, -128, 127, 127, 0, 0, 0];
    assert(B.array == correct);
}

/// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the result right by `count` bytes, and return the low 16 bytes.
__m128i _mm_alignr_epi8(ubyte count)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_palignr128(cast(long2)a, cast(long2)b, count * 8);
    }
    else version(LDC)
    {
        static if (count >= 32)
        {
            return _mm_setzero_si128();
        }
        else static if (count < 16)
        {
            // Generates palignr since LDC 1.1 -O1
            // Also generates a single ext instruction on arm64.
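            // The shuffle below selects bytes count .. count+15 of the 32-byte
            // concatenation, where `b` supplies bytes 0..15 and `a` supplies bytes 16..31.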
            return cast(__m128i) shufflevector!(byte16, ( 0 + count),
                                                        ( 1 + count),
                                                        ( 2 + count),
                                                        ( 3 + count),
                                                        ( 4 + count),
                                                        ( 5 + count),
                                                        ( 6 + count),
                                                        ( 7 + count),
                                                        ( 8 + count),
                                                        ( 9 + count),
                                                        (10 + count),
                                                        (11 + count),
                                                        (12 + count),
                                                        (13 + count),
                                                        (14 + count),
                                                        (15 + count))(cast(byte16)b, cast(byte16)a);
        }
        else
        {
            return cast(__m128i) shufflevector!(byte16, ( 0 + count) % 32,
                                                        ( 1 + count) % 32,
                                                        ( 2 + count) % 32,
                                                        ( 3 + count) % 32,
                                                        ( 4 + count) % 32,
                                                        ( 5 + count) % 32,
                                                        ( 6 + count) % 32,
                                                        ( 7 + count) % 32,
                                                        ( 8 + count) % 32,
                                                        ( 9 + count) % 32,
                                                        (10 + count) % 32,
                                                        (11 + count) % 32,
                                                        (12 + count) % 32,
                                                        (13 + count) % 32,
                                                        (14 + count) % 32,
                                                        (15 + count) % 32)(cast(byte16)_mm_setzero_si128(), cast(byte16)a);
        }
    }
    else
    {
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        byte16 r;

        for (int i = 0; i < 16; ++i)
        {
            const int srcpos = count + cast(int)i;
            if (srcpos > 31)
            {
                r.ptr[i] = 0;
            }
            else if (srcpos > 15)
            {
                r.ptr[i] = ab.array[(srcpos) & 15];
            }
            else
            {
                r.ptr[i] = bb.array[srcpos];
            }
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);

    {
        byte16 C = cast(byte16)_mm_alignr_epi8!0(A, B);
        byte[16] correct = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
        assert(C.array == correct);
    }
    {
        byte16 C = cast(byte16)_mm_alignr_epi8!20(A, B);
        byte[16] correct = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0];
        assert(C.array == correct);
    }
    {
        byte16 C = cast(byte16)_mm_alignr_epi8!34(A, B);
        byte[16] correct = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        assert(C.array == correct);
    }

    __m128i D = _mm_setr_epi8(-123, -82, 103, -69, 103, -26, 9, 106, 58, -11, 79, -91, 114, -13, 110, 60);
    __m128i E = _mm_setr_epi8(25, -51, -32, 91, -85, -39, -125, 31, -116, 104, 5, -101, 127, 82, 14, 81);
    byte16 F = cast(byte16)_mm_alignr_epi8!8(D, E);
    byte[16] correct = [-116, 104, 5, -101, 127, 82, 14, 81, -123, -82, 103, -69, 103, -26, 9, 106];
    assert(F.array == correct);
}

/// Concatenate 8-byte blocks in `a` and `b` into a 16-byte temporary result, shift the result right by `count` bytes, and return the low 8 bytes.
__m64 _mm_alignr_pi8(ubyte count)(__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_palignr(cast(long1)a, cast(long1)b, count * 8);
    }
    else version(LDC)
    {
        static if (count >= 16)
        {
            return _mm_setzero_si64();
        }
        else static if (count < 8)
        {
            // Note: in LDC x86 this uses a pshufb.
            // Generates ext in arm64.
            return cast(__m64) shufflevector!(byte8, (0 + count),
                                                     (1 + count),
                                                     (2 + count),
                                                     (3 + count),
                                                     (4 + count),
                                                     (5 + count),
                                                     (6 + count),
                                                     (7 + count))(cast(byte8)b, cast(byte8)a);
        }
        else
        {
            return cast(__m64) shufflevector!(byte8, (0 + count) % 16,
                                                     (1 + count) % 16,
                                                     (2 + count) % 16,
                                                     (3 + count) % 16,
                                                     (4 + count) % 16,
                                                     (5 + count) % 16,
                                                     (6 + count) % 16,
                                                     (7 + count) % 16)(cast(byte8)_mm_setzero_si64(), cast(byte8)a);
        }
    }
    else
    {
        byte8 ab = cast(byte8)a;
        byte8 bb = cast(byte8)b;
        byte8 r;

        for (int i = 0; i < 8; ++i)
        {
            const int srcpos = count + cast(int)i;
            if (srcpos > 15)
            {
                r.ptr[i] = 0;
            }
            else if (srcpos > 7)
            {
                r.ptr[i] = ab.array[(srcpos) & 7];
            }
            else
            {
                r.ptr[i] = bb.array[srcpos];
            }
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
    __m64 B = _mm_setr_pi8(17, 18, 19, 20, 21, 22, 23, 24);

    {
        byte8 C = cast(byte8)_mm_alignr_pi8!0(A, B);
        byte[8] correct = [17, 18, 19, 20, 21, 22, 23, 24];
        assert(C.array == correct);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!3(A, B);
        byte[8] correct = [20, 21, 22, 23, 24, 1, 2, 3];
        assert(C.array == correct);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!11(A, B);
        byte[8] correct = [4, 5, 6, 7, 8, 0, 0, 0];
        assert(C.array == correct);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!17(A, B);
        byte[8] correct = [0, 0, 0, 0, 0, 0, 0, 0];
        assert(C.array == correct);
    }
}

/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m128i _mm_hadd_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i)vpaddq_s16(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
        r.ptr[2] = cast(short)(sa.array[4] + sa.array[5]);
        r.ptr[3] = cast(short)(sa.array[6] + sa.array[7]);
        r.ptr[4] = cast(short)(sb.array[0] + sb.array[1]);
        r.ptr[5] = cast(short)(sb.array[2] + sb.array[3]);
        r.ptr[6] = cast(short)(sb.array[4] + sb.array[5]);
        r.ptr[7] = cast(short)(sb.array[6] + sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
    short8 C = cast(short8) _mm_hadd_epi16(A, A);
    short[8] correct = [ -1, 12, 48, 32767, -1, 12, 48, 32767];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
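/// The result is `[a0+a1, a2+a3, b0+b1, b2+b3]`.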
__m128i _mm_hadd_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i)vpaddq_s32(cast(int4)a, cast(int4)b);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] + ia.array[1];
        r.ptr[1] = ia.array[2] + ia.array[3];
        r.ptr[2] = ib.array[0] + ib.array[1];
        r.ptr[3] = ib.array[2] + ib.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, -2, int.min, -1);
    __m128i B = _mm_setr_epi32(1, int.max, 4, -4);
    int4 C = cast(int4) _mm_hadd_epi32(A, B);
    int[4] correct = [ -1, int.max, int.min, 0 ];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m64 _mm_hadd_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_phaddw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m64) vpadd_s16(cast(short4)a, cast(short4)b);
    }
    else
    {
        // LDC x86: generates phaddw since LDC 1.24 -O2.
        short4 r;
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
        r.ptr[2] = cast(short)(sb.array[0] + sb.array[1]);
        r.ptr[3] = cast(short)(sb.array[2] + sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(1, -2, 4, 8);
    __m64 B = _mm_setr_pi16(16, 32, -1, -32768);
    short4 C = cast(short4) _mm_hadd_pi16(A, B);
    short[4] correct = [ -1, 12, 48, 32767 ];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`,
/// and pack the signed 32-bit results.
__m64 _mm_hadd_pi32 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_phaddd(cast(int2)a, cast(int2)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m64)vpadd_s32(cast(int2)a, cast(int2)b);
    }
    else
    {
        // LDC x86: generates phaddd since LDC 1.24 -O2
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 r;
        r.ptr[0] = ia.array[0] + ia.array[1];
        r.ptr[1] = ib.array[0] + ib.array[1];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi32(int.min, -1);
    __m64 B = _mm_setr_pi32(1, int.max);
    int2 C = cast(int2) _mm_hadd_pi32(A, B);
    int[2] correct = [ int.max, int.min ];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m128i _mm_hadds_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqadd sequence
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)vqaddq_s16(c, d);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] + sa.array[5]);
        r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] + sa.array[7]);
        r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
        r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
        r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] + sb.array[5]);
        r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] + sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
    short8 C = cast(short8) _mm_hadds_epi16(A, A);
    short[8] correct = [ -1, 12, 48, -32768, -1, 12, 48, -32768];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m64 _mm_hadds_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phaddsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // Note: LDC doesn't have __builtin_ia32_phaddsw
        long2 la;
        la.ptr[0] = a.array[0];
        long2 lb;
        lb.ptr[0] = b.array[0];
        int4 sum = cast(int4)__builtin_ia32_phaddsw128(cast(short8)la, cast(short8)lb);
        int2 r;
        r.ptr[0] = sum.array[0];
        r.ptr[1] = sum.array[2];
        return cast(__m64)r;
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqadd sequence
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)vqadd_s16(c, d);
    }
    else
    {
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
        r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(-16, 32, -100, -32768);
    __m64 B = _mm_setr_pi16( 64, 32, 1, 32767);
    short4 C = cast(short4) _mm_hadds_pi16(A, B);
    short[4] correct = [ 16, -32768, 96, 32767];
    assert(C.array == correct);
}


/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
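/// The result is `[a0-a1, a2-a3, a4-a5, a6-a7, b0-b1, b2-b3, b4-b5, b6-b7]`.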
__m128i _mm_hsub_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produce uzp1 uzp2 sub sequence since LDC 1.8 -O1
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)(c - d);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
        r.ptr[2] = cast(short)(sa.array[4] - sa.array[5]);
        r.ptr[3] = cast(short)(sa.array[6] - sa.array[7]);
        r.ptr[4] = cast(short)(sb.array[0] - sb.array[1]);
        r.ptr[5] = cast(short)(sb.array[2] - sb.array[3]);
        r.ptr[6] = cast(short)(sb.array[4] - sb.array[5]);
        r.ptr[7] = cast(short)(sb.array[6] - sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(short.min, 1, 4, 8, 16, 32, 1, -32768);
    short8 C = cast(short8) _mm_hsub_epi16(A, A);
    short[8] correct = [ short.max, -4, -16, -32767, short.max, -4, -16, -32767];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m128i _mm_hsub_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produce uzp1 uzp2 sub sequence since LDC 1.8 -O1
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 c = shufflevector!(int4, 0, 2, 4, 6)(ia, ib);
        int4 d = shufflevector!(int4, 1, 3, 5, 7)(ia, ib);
        return cast(__m128i)(c - d);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] - ia.array[1];
        r.ptr[1] = ia.array[2] - ia.array[3];
        r.ptr[2] = ib.array[0] - ib.array[1];
        r.ptr[3] = ib.array[2] - ib.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, int.min, 1);
    __m128i B = _mm_setr_epi32(int.max, -1, 4, 4);
    int4 C = cast(int4) _mm_hsub_epi32(A, B);
    int[4] correct = [ -1, int.max, int.min, 0 ];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`,
/// and pack the signed 16-bit results.
__m64 _mm_hsub_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produce uzp1 uzp2 sub sequence since LDC 1.3 -O1
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)(c - d);
    }
    else
    {
        // LDC x86: generates phsubw since LDC 1.24 -O2
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
        r.ptr[2] = cast(short)(sb.array[0] - sb.array[1]);
        r.ptr[3] = cast(short)(sb.array[2] - sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(short.min, 1, 4, 8);
    __m64 B = _mm_setr_pi16(16, 32, 1, -32768);
    short4 C = cast(short4) _mm_hsub_pi16(A, B);
    short[4] correct = [ short.max, -4, -16, -32767];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`,
/// and pack the signed 32-bit results.
__m64 _mm_hsub_pi32 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubd(cast(int2)a, cast(int2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: generates zip1+zip2+sub sequence since LDC 1.8 -O1
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 c = shufflevector!(int2, 0, 2)(ia, ib);
        int2 d = shufflevector!(int2, 1, 3)(ia, ib);
        return cast(__m64)(c - d);
    }
    else
    {
        // LDC x86: generates phsubd since LDC 1.24 -O2
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 r;
        r.ptr[0] = ia.array[0] - ia.array[1];
        r.ptr[1] = ib.array[0] - ib.array[1];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi32(int.min, 1);
    __m64 B = _mm_setr_pi32(int.max, -1);
    int2 C = cast(int2) _mm_hsub_pi32(A, B);
    int[2] correct = [ int.max, int.min ];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqsub sequence
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)vqsubq_s16(c, d);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] - sa.array[5]);
        r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] - sa.array[7]);
        r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
        r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
        r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] - sb.array[5]);
        r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] - sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 32767, -1, -10, 32767);
    short8 C = cast(short8) _mm_hsubs_epi16(A, A);
    short[8] correct = [ 3, -4, 32767, -32768, 3, -4, 32767, -32768 ];
    assert(C.array == correct);
}


/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // Note: LDC doesn't have __builtin_ia32_phsubsw
        long2 la;
        la.ptr[0] = a.array[0];
        long2 lb;
        lb.ptr[0] = b.array[0];
        int4 sum = cast(int4)__builtin_ia32_phsubsw128(cast(short8)la, cast(short8)lb);
        int2 r;
        r.ptr[0] = sum.array[0];
        r.ptr[1] = sum.array[2];
        return cast(__m64)r;
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqsub sequence in -O1
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)vqsub_s16(c, d);
    }
    else
    {
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
        r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(-16, 32, 100, -32768);
    __m64 B = _mm_setr_pi16( 64, 30, -9, 32767);
    short4 C = cast(short4) _mm_hsubs_pi16(A, B);
    short[4] correct = [ -48, 32767, 34, -32768];
    assert(C.array == correct);
}


/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding
/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers.
/// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
/// and pack the saturated results.
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // zero-extend a to 16-bit
        __m128i zero = _mm_setzero_si128();
        __m128i a_lo = _mm_unpacklo_epi8(a, zero);
        __m128i a_hi = _mm_unpackhi_epi8(a, zero);

        // sign-extend b to 16-bit
        __m128i b_lo = _mm_unpacklo_epi8(b, zero);
        __m128i b_hi = _mm_unpackhi_epi8(b, zero);
        b_lo = _mm_srai_epi16( _mm_slli_epi16(b_lo, 8), 8);
        b_hi = _mm_srai_epi16( _mm_slli_epi16(b_hi, 8), 8);

        // Multiply element-wise, no overflow can occur
        __m128i c_lo = _mm_mullo_epi16(a_lo, b_lo);
        __m128i c_hi = _mm_mullo_epi16(a_hi, b_hi);

        // Add pairwise with saturating horizontal add
        return _mm_hadds_epi16(c_lo, c_hi);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( -1, 10, 100, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // u8
    __m128i B = _mm_setr_epi8(-128, -30, 100, 127, -1, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0); // i8
    short8 C = cast(short8) _mm_maddubs_epi16(A, B);
    short[8] correct = [ -32768, 26256, 0, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding
/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers.
/// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
/// and pack the saturated results.
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_pmaddubsw(cast(byte8)a, cast(byte8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        __m128i A = to_m128i(a);
        __m128i B = to_m128i(b);
        return to_m64( cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)A, cast(byte16)B) );
    }
    else
    {
        // zero-extend a to 16-bit
        __m128i zero = _mm_setzero_si128();
        __m128i A = _mm_unpacklo_epi8(to_m128i(a), zero);

        // sign-extend b to 16-bit
        __m128i B = _mm_unpacklo_epi8(to_m128i(b), zero);
        B = _mm_srai_epi16( _mm_slli_epi16(B, 8), 8);

        // Multiply element-wise, no overflow can occur
        __m128i c = _mm_mullo_epi16(A, B);

        // Add pairwise with saturating horizontal add
        return to_m64( _mm_hadds_epi16(c, zero) );
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( -1, 10, 100, -128, 0, 0, 0, 0); // u8
    __m64 B = _mm_setr_pi8(-128, -30, 100, 127, -1, 2, 4, 6); // i8
    short4 C = cast(short4) _mm_maddubs_pi16(A, B);
    short[4] correct = [ -32768, 26256, 0, 0];
    assert(C.array == correct);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
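/// In practice, each 16-bit lane computes `(a * b + 0x4000) >> 15`, truncated to 16 bits
/// (see the scalar fallback below).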
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        int4 mul_lo = vmull_s16(vget_low_s16(cast(short8)a),
                                vget_low_s16(cast(short8)b));
        int4 mul_hi = vmull_s16(vget_high_s16(cast(short8)a),
                                vget_high_s16(cast(short8)b));

        // Rounding narrowing shift right
        // narrow = (int16_t)((mul + 16384) >> 15);
        short4 narrow_lo = vrshrn_n_s32(mul_lo, 15);
        short4 narrow_hi = vrshrn_n_s32(mul_hi, 15);

        // Join together.
        return cast(__m128i) vcombine_s16(narrow_lo, narrow_hi);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;

        for (int i = 0; i < 8; ++i)
        {
            // I doubted it at first, but an exhaustive search shows this to be equivalent to Intel pseudocode.
            r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
        }

        return cast(__m128i)r;
    }
}

unittest
{
    __m128i A = _mm_setr_epi16(12345, -32768, 32767, 0, 1, 845, -6999, -1);
    __m128i B = _mm_setr_epi16(8877, -24487, 15678, 32760, 1, 0, -149, -1);
    short8 C = cast(short8) _mm_mulhrs_epi16(A, B);
    short[8] correct = [3344, 24487, 15678, 0, 0, 0, 32, 0];
    assert(C.array == correct);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_pmulhrsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m64) to_m64( cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8) to_m128i(a), cast(short8) to_m128i(b)));
    }
    else static if (LDC_with_ARM64)
    {
        int4 mul = vmull_s16(cast(short4)a, cast(short4)b);

        // Rounding narrowing shift right
        // (int16_t)((mul + 16384) >> 15);
        return cast(__m64) vrshrn_n_s32(mul, 15);
    }
    else
    {
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;

        for (int i = 0; i < 4; ++i)
        {
            r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(12345, -32768, 32767, 0);
    __m64 B = _mm_setr_pi16(8877, -24487, 15678, 32760);
    short4 C = cast(short4) _mm_mulhrs_pi16(A, B);
    short[4] correct = [3344, 24487, 15678, 0];
    assert(C.array == correct);
}


/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
__m128i _mm_shuffle_epi8 (__m128i a, __m128i b) @trusted
{
    // This is the lovely pshufb.
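    // Each control byte of `b` works like this: if its most significant bit is set,
    // the corresponding result byte is zero; otherwise its low 4 bits select one of
    // the 16 bytes of `a` (see the scalar fallback below).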
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
    }
    else static if (LDC_with_ARM64)
    {
        byte16 bb = cast(byte16)b;
        byte16 mask;
        mask = cast(byte)(0x8F);
        bb = bb & mask;
        byte16 r = vqtbl1q_s8(cast(byte16)a, bb);
        return cast(__m128i)r;
    }
    else
    {
        byte16 r;
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        for (int i = 0; i < 16; ++i)
        {
            byte s = bb.array[i];
            r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 15 ];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m128i B = _mm_setr_epi8(15, -128, 13 + 16, -12, 11, -10, 9, 8, 7, 6, -5, 4, 3, -2, 1, 0);
    byte16 C = cast(byte16) _mm_shuffle_epi8(A, B);
    byte[16] correct = [0, 0, 2, 0, 4, 0, 6, 7, 8, 9, 0, 11, 12, 0, 14, 15];
    assert(C.array == correct);
}

/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
__m64 _mm_shuffle_pi8 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        alias ubyte8 = __vector(ubyte[8]);
        return cast(__m64) __builtin_ia32_pshufb(cast(ubyte8) a, cast(ubyte8) b);
    }
    else static if (LDC_with_SSSE3)
    {
        // GDC does proper dance to avoid mmx registers, do it manually in LDC since __builtin_ia32_pshufb doesn't exist there
        __m128i A = to_m128i(a);
        __m128i index = to_m128i(b);
        index = index & _mm_set1_epi32(0xF7F7F7F7);
        return to_m64( cast(__m128i) __builtin_ia32_pshufb128(cast(byte16)A, cast(byte16) index) );
    }
    else static if (LDC_with_ARM64)
    {
        byte8 bb = cast(byte8)b;
        byte8 mask;
        mask = cast(byte)(0x87);
        bb = bb & mask;
        __m128i l = to_m128i(a);
        byte8 r = vtbl1_s8(cast(byte16)l, cast(byte8)bb);
        return cast(__m64)r;
    }
    else
    {
        byte8 r;
        byte8 ba = cast(byte8)a;
        byte8 bb = cast(byte8)b;
        for (int i = 0; i < 8; ++i)
        {
            byte s = bb.array[i];
            r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 7 ];
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    __m64 B = _mm_setr_pi8(7, 6, -5, 4, 3 + 8, -2, 1, 0);
    byte8 C = cast(byte8) _mm_shuffle_pi8(A, B);
    byte[8] correct = [0, 1, 0, 3, 4, 0, 6, 7];
    assert(C.array == correct);
}

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
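/// Conceptually, each element of `a` is multiplied by the sign (-1, 0 or +1) of the corresponding element of `b`.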
__m128i _mm_sign_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // LDC arm64: 5 instructions
        __m128i mask = _mm_srai_epi16(b, 15);
        __m128i zeromask = _mm_cmpeq_epi16(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi16(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-2, -1, 0, 1, 2, short.min, short.min, short.min);
    __m128i B = _mm_setr_epi16(-1, 0,-1, 1, -2, -50, 0, 50);
    short8 C = cast(short8) _mm_sign_epi16(A, B);
    short[8] correct = [ 2, 0, 0, 1, -2, short.min, 0, short.min];
    assert(C.array == correct);
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
    }
    else
    {
        __m128i mask = _mm_srai_epi32(b, 31);
        __m128i zeromask = _mm_cmpeq_epi32(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi32(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-2, -1, 0, int.max);
    __m128i B = _mm_setr_epi32(-1, 0, -1, 1);
    int4 C = cast(int4) _mm_sign_epi32(A, B);
    int[4] correct = [ 2, 0, 0, int.max];
    assert(C.array == correct);
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        __m128i mask = _mm_cmplt_epi8(b, _mm_setzero_si128()); // extend sign bit
        __m128i zeromask = _mm_cmpeq_epi8(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi8(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-2, -1, 0, 1, 2, byte.min, byte.min, byte.min, -1, 0,-1, 1, -2, -50, 0, 50);
    __m128i B = _mm_setr_epi8(-1, 0,-1, 1, -2, -50, 0, 50, -2, -1, 0, 1, 2, byte.min, byte.min, byte.min);
    byte16 C = cast(byte16) _mm_sign_epi8(A, B);
    byte[16] correct = [ 2, 0, 0, 1, -2, byte.min, 0, byte.min, 1, 0, 0, 1, -2, 50, 0, -50];
    assert(C.array == correct);
}

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi16 (__m64 a, __m64 b) @trusted
{
    return to_m64( _mm_sign_epi16( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi16( 2, short.min, short.min, short.min);
    __m64 B = _mm_setr_pi16(-2, -50, 0, 50);
    short4 C = cast(short4) _mm_sign_pi16(A, B);
    short[4] correct = [-2, short.min, 0, short.min];
    assert(C.array == correct);
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi32 (__m64 a, __m64 b) @trusted
{
    return to_m64( _mm_sign_epi32( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi32(-2, -100);
    __m64 B = _mm_setr_pi32(-1, 0);
    int2 C = cast(int2) _mm_sign_pi32(A, B);
    int[2] correct = [ 2, 0];
    assert(C.array == correct);
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi8 (__m64 a, __m64 b) @trusted
{
    return to_m64( _mm_sign_epi8( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi8(-2, -1, 0, 1, 2, byte.min, byte.min, byte.min);
    __m64 B = _mm_setr_pi8(-1, 0,-1, 1, -2, -50, 0, 50);
    byte8 C = cast(byte8) _mm_sign_pi8(A, B);
    byte[8] correct = [ 2, 0, 0, 1, -2, byte.min, 0, byte.min];
    assert(C.array == correct);
}