/**
* SSSE3 intrinsics.
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.tmmintrin;

public import inteli.types;
import inteli.internals;

public import inteli.pmmintrin;
import inteli.mmx;

nothrow @nogc:


// SSSE3 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSSE3
// Note: this header will work whether you have SSSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+ssse3"] or equivalent to actively
// generate SSSE3 instructions.

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m128i _mm_abs_epi16 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSW, a);
    }
    else static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pabsw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s16(cast(short8)a);
    }
    else
    {
        // LDC x86: generates pabsw since LDC 1.1 -O2
        short8 sa = cast(short8)a;
        for (int i = 0; i < 8; ++i)
        {
            short s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s));
        }
        return cast(__m128i)sa;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000);
    short8 B = cast(short8) _mm_abs_epi16(A);
    short[8] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m128i _mm_abs_epi32 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSD, cast(int4)a);
    }
    else static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pabsd128(cast(int4)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s32(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pabsd since LDC 1.1 -O2
        int4 sa = cast(int4)a;
        for (int i = 0; i < 4; ++i)
        {
            int s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : -s;
        }
        return cast(__m128i)sa;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647);
    int4 B = cast(int4) _mm_abs_epi32(A);
    int[4] correct = [0, 1, -2_147_483_648, 2_147_483_647];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
__m128i _mm_abs_epi8 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSB, cast(byte16)a);
    }
    else static if (GDC_with_SSSE3)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pabsb128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s8(cast(byte16)a);
    }
    else static if (LDC_with_SSSE3)
    {
        return __asm!__m128i("pabsb $1,$0","=x,x",a);
    }
    else
    {
        // A loop version like in _mm_abs_epi16/_mm_abs_epi32 would be very slow
        // in LDC x86 and wouldn't vectorize. This doesn't generate pabsb in LDC, though.
        return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 B = cast(byte16) _mm_abs_epi8(A);
    byte[16] correct = [0, 1, -128, 127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m64 _mm_abs_pi16 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi16(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi16(0, -1, -32768, 32767);
    short4 B = cast(short4) _mm_abs_pi16(A);
    short[4] correct = [0, 1, -32768, 32767];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m64 _mm_abs_pi32 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi32(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi32(-1, -2_147_483_648);
    int2 B = cast(int2) _mm_abs_pi32(A);
    int[2] correct = [1, -2_147_483_648];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
__m64 _mm_abs_pi8 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi8(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi8(0, -1, -128, -127, 127, 0, 0, 0);
    byte8 B = cast(byte8) _mm_abs_pi8(A);
    byte[8] correct = [0, 1, -128, 127, 127, 0, 0, 0];
    assert(B.array == correct);
}

/// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the result right by `count` bytes, and return the low 16 bytes.
__m128i _mm_alignr_epi8(ubyte count)(__m128i a, __m128i b) @trusted
{
    static assert(count < 32);

    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_palignr128(cast(long2)a, cast(long2)b, count * 8);
    }
    else version(LDC)
    {
        static if (count < 16)
        {
            // Generates palignr since LDC 1.1 -O1
            // Also generates a single ext instruction on arm64.
            return cast(__m128i) shufflevector!(byte16, ( 0 + count) % 32,
                                                        ( 1 + count) % 32,
                                                        ( 2 + count) % 32,
                                                        ( 3 + count) % 32,
                                                        ( 4 + count) % 32,
                                                        ( 5 + count) % 32,
                                                        ( 6 + count) % 32,
                                                        ( 7 + count) % 32,
                                                        ( 8 + count) % 32,
                                                        ( 9 + count) % 32,
                                                        (10 + count) % 32,
                                                        (11 + count) % 32,
                                                        (12 + count) % 32,
                                                        (13 + count) % 32,
                                                        (14 + count) % 32,
                                                        (15 + count) % 32)(cast(byte16)b, cast(byte16)a);
        }
        else
        {
            return cast(__m128i) shufflevector!(byte16, ( 0 + count) % 32,
                                                        ( 1 + count) % 32,
                                                        ( 2 + count) % 32,
                                                        ( 3 + count) % 32,
                                                        ( 4 + count) % 32,
                                                        ( 5 + count) % 32,
                                                        ( 6 + count) % 32,
                                                        ( 7 + count) % 32,
                                                        ( 8 + count) % 32,
                                                        ( 9 + count) % 32,
                                                        (10 + count) % 32,
                                                        (11 + count) % 32,
                                                        (12 + count) % 32,
                                                        (13 + count) % 32,
                                                        (14 + count) % 32,
                                                        (15 + count) % 32)(cast(byte16)_mm_setzero_si128(), cast(byte16)a);
        }
    }
    else
    {
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        byte16 r;

        for (int i = 0; i < 16; ++i)
        {
            const int srcpos = count + cast(int)i;
            if (srcpos > 31)
            {
                r.ptr[i] = 0;
            }
            else if (srcpos > 15)
            {
                r.ptr[i] = ab[(srcpos) & 15];
            }
            else
            {
                r.ptr[i] = bb[srcpos];
            }
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);

    {
        byte16 C = cast(byte16)_mm_alignr_epi8!0(A, B);
        byte[16] correct = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
        assert(C.array == correct);
    }
    {
        byte16 C = cast(byte16)_mm_alignr_epi8!20(A, B);
        byte[16] correct = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0];
        assert(C.array == correct);
    }

    __m128i D = _mm_setr_epi8(-123, -82, 103, -69, 103, -26, 9, 106, 58, -11, 79, -91, 114, -13, 110, 60);
    __m128i E = _mm_setr_epi8(25, -51, -32, 91, -85, -39, -125, 31, -116, 104, 5, -101, 127, 82, 14, 81);
    byte16 F = cast(byte16)_mm_alignr_epi8!8(D, E);
    byte[16] correct = [-116, 104, 5, -101, 127, 82, 14, 81, -123, -82, 103, -69, 103, -26, 9, 106];
    assert(F.array == correct);
}
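
// Illustrative usage, as a minimal sketch: passing the same vector twice turns
// `_mm_alignr_epi8` into a byte rotation, a common way to realign data that
// straddles two consecutive loads.
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    byte16 R = cast(byte16) _mm_alignr_epi8!4(A, A);
    byte[16] rotated = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4];
    assert(R.array == rotated);
}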

/// Concatenate 8-byte blocks in `a` and `b` into a 16-byte temporary result, shift the result right by `count` bytes, and return the low 8 bytes.
__m64 _mm_alignr_pi8(ubyte count)(__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_palignr(cast(long)a, cast(long)b, count * 8);
    }
    else
    {
        // Note: `a` forms the high 8 bytes of the 16-byte temporary and `b` the low
        // 8 bytes; bytes shifted in from beyond the concatenation are zero.
        byte8 ab = cast(byte8)a;
        byte8 bb = cast(byte8)b;
        byte8 r;

        for (int i = 0; i < 8; ++i)
        {
            const int srcpos = count + cast(int)i;
            if (srcpos > 15)
            {
                r.ptr[i] = 0;
            }
            else if (srcpos > 7)
            {
                r.ptr[i] = ab[srcpos & 7];
            }
            else
            {
                r.ptr[i] = bb[srcpos];
            }
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
    __m64 B = _mm_setr_pi8(17, 18, 19, 20, 21, 22, 23, 24);

    {
        byte8 C = cast(byte8)_mm_alignr_pi8!3(A, B);
        byte[8] correct = [20, 21, 22, 23, 24, 1, 2, 3];
        assert(C.array == correct);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!10(A, B);
        byte[8] correct = [3, 4, 5, 6, 7, 8, 0, 0];
        assert(C.array == correct);
    }
}

/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m128i _mm_hadd_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i)vpaddq_s16(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
        r.ptr[2] = cast(short)(sa.array[4] + sa.array[5]);
        r.ptr[3] = cast(short)(sa.array[6] + sa.array[7]);
        r.ptr[4] = cast(short)(sb.array[0] + sb.array[1]);
        r.ptr[5] = cast(short)(sb.array[2] + sb.array[3]);
        r.ptr[6] = cast(short)(sb.array[4] + sb.array[5]);
        r.ptr[7] = cast(short)(sb.array[6] + sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
    short8 C = cast(short8) _mm_hadd_epi16(A, A);
    short[8] correct = [ -1, 12, 48, 32767, -1, 12, 48, 32767];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m128i _mm_hadd_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i)vpaddq_s32(cast(int4)a, cast(int4)b);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] + ia.array[1];
        r.ptr[1] = ia.array[2] + ia.array[3];
        r.ptr[2] = ib.array[0] + ib.array[1];
        r.ptr[3] = ib.array[2] + ib.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, -2, int.min, -1);
    __m128i B = _mm_setr_epi32(1, int.max, 4, -4);
    int4 C = cast(int4) _mm_hadd_epi32(A, B);
    int[4] correct = [ -1, int.max, int.min, 0 ];
    assert(C.array == correct);
}
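
// Illustrative usage, as a minimal sketch: applying `_mm_hadd_epi32` twice reduces a
// vector to its horizontal sum, broadcast into every lane.
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i S = _mm_hadd_epi32(A, A); // [3, 7, 3, 7]
    S = _mm_hadd_epi32(S, S);         // [10, 10, 10, 10]
    assert((cast(int4)S).array[0] == 10);
}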

/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m64 _mm_hadd_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_phaddw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m64) vpadd_s16(cast(short4)a, cast(short4)b);
    }
    else
    {
        // LDC x86: generates phaddw since LDC 1.24 -O2.
        short4 r;
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
        r.ptr[2] = cast(short)(sb.array[0] + sb.array[1]);
        r.ptr[3] = cast(short)(sb.array[2] + sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(1, -2, 4, 8);
    __m64 B = _mm_setr_pi16(16, 32, -1, -32768);
    short4 C = cast(short4) _mm_hadd_pi16(A, B);
    short[4] correct = [ -1, 12, 48, 32767 ];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`,
/// and pack the signed 32-bit results.
__m64 _mm_hadd_pi32 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_phaddd(cast(int2)a, cast(int2)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m64)vpadd_s32(cast(int2)a, cast(int2)b);
    }
    else
    {
        // LDC x86: generates phaddd since LDC 1.24 -O2
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 r;
        r.ptr[0] = ia.array[0] + ia.array[1];
        r.ptr[1] = ib.array[0] + ib.array[1];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi32(int.min, -1);
    __m64 B = _mm_setr_pi32(1, int.max);
    int2 C = cast(int2) _mm_hadd_pi32(A, B);
    int[2] correct = [ int.max, int.min ];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m128i _mm_hadds_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqadd sequence
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)vqaddq_s16(c, d);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] + sa.array[5]);
        r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] + sa.array[7]);
        r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
        r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
        r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] + sb.array[5]);
        r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] + sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
    short8 C = cast(short8) _mm_hadds_epi16(A, A);
    short[8] correct = [ -1, 12, 48, -32768, -1, 12, 48, -32768];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m64 _mm_hadds_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phaddsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // Note: LDC doesn't have __builtin_ia32_phaddsw
        long2 la;
        la.ptr[0] = a.array[0];
        long2 lb;
        lb.ptr[0] = b.array[0];
        int4 sum = cast(int4)__builtin_ia32_phaddsw128(cast(short8)la, cast(short8)lb);
        int2 r;
        r.ptr[0] = sum.array[0];
        r.ptr[1] = sum.array[2];
        return cast(__m64)r;
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqadd sequence
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)vqadd_s16(c, d);
    }
    else
    {
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
        r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(-16, 32, -100, -32768);
    __m64 B = _mm_setr_pi16( 64, 32, 1, 32767);
    short4 C = cast(short4) _mm_hadds_pi16(A, B);
    short[4] correct = [ 16, -32768, 96, 32767];
    assert(C.array == correct);
}


/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m128i _mm_hsub_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produces uzp1/uzp2/sub sequence since LDC 1.8 -O1
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)(c - d);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
        r.ptr[2] = cast(short)(sa.array[4] - sa.array[5]);
        r.ptr[3] = cast(short)(sa.array[6] - sa.array[7]);
        r.ptr[4] = cast(short)(sb.array[0] - sb.array[1]);
        r.ptr[5] = cast(short)(sb.array[2] - sb.array[3]);
        r.ptr[6] = cast(short)(sb.array[4] - sb.array[5]);
        r.ptr[7] = cast(short)(sb.array[6] - sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(short.min, 1, 4, 8, 16, 32, 1, -32768);
    short8 C = cast(short8) _mm_hsub_epi16(A, A);
    short[8] correct = [ short.max, -4, -16, -32767, short.max, -4, -16, -32767];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m128i _mm_hsub_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produces uzp1/uzp2/sub sequence since LDC 1.8 -O1
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 c = shufflevector!(int4, 0, 2, 4, 6)(ia, ib);
        int4 d = shufflevector!(int4, 1, 3, 5, 7)(ia, ib);
        return cast(__m128i)(c - d);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] - ia.array[1];
        r.ptr[1] = ia.array[2] - ia.array[3];
        r.ptr[2] = ib.array[0] - ib.array[1];
        r.ptr[3] = ib.array[2] - ib.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, int.min, 1);
    __m128i B = _mm_setr_epi32(int.max, -1, 4, 4);
    int4 C = cast(int4) _mm_hsub_epi32(A, B);
    int[4] correct = [ -1, int.max, int.min, 0 ];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`,
/// and pack the signed 16-bit results.
__m64 _mm_hsub_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produces uzp1/uzp2/sub sequence since LDC 1.3 -O1
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)(c - d);
    }
    else
    {
        // LDC x86: generates phsubw since LDC 1.24 -O2
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
        r.ptr[2] = cast(short)(sb.array[0] - sb.array[1]);
        r.ptr[3] = cast(short)(sb.array[2] - sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(short.min, 1, 4, 8);
    __m64 B = _mm_setr_pi16(16, 32, 1, -32768);
    short4 C = cast(short4) _mm_hsub_pi16(A, B);
    short[4] correct = [ short.max, -4, -16, -32767];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`,
/// and pack the signed 32-bit results.
__m64 _mm_hsub_pi32 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubd(cast(int2)a, cast(int2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: generates zip1+zip2+sub sequence since LDC 1.8 -O1
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 c = shufflevector!(int2, 0, 2)(ia, ib);
        int2 d = shufflevector!(int2, 1, 3)(ia, ib);
        return cast(__m64)(c - d);
    }
    else
    {
        // LDC x86: generates phsubd since LDC 1.24 -O2
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 r;
        r.ptr[0] = ia.array[0] - ia.array[1];
        r.ptr[1] = ib.array[0] - ib.array[1];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi32(int.min, 1);
    __m64 B = _mm_setr_pi32(int.max, -1);
    int2 C = cast(int2) _mm_hsub_pi32(A, B);
    int[2] correct = [ int.max, int.min ];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqsub sequence
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)vqsubq_s16(c, d);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] - sa.array[5]);
        r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] - sa.array[7]);
        r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
        r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
        r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] - sb.array[5]);
        r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] - sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 32767, -1, -10, 32767);
    short8 C = cast(short8) _mm_hsubs_epi16(A, A);
    short[8] correct = [ 3, -4, 32767, -32768, 3, -4, 32767, -32768 ];
    assert(C.array == correct);
}


/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // Note: LDC doesn't have __builtin_ia32_phsubsw
        long2 la;
        la.ptr[0] = a.array[0];
        long2 lb;
        lb.ptr[0] = b.array[0];
        int4 sum = cast(int4)__builtin_ia32_phsubsw128(cast(short8)la, cast(short8)lb);
        int2 r;
        r.ptr[0] = sum.array[0];
        r.ptr[1] = sum.array[2];
        return cast(__m64)r;
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqsub sequence in -O1
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)vqsub_s16(c, d);
    }
    else
    {
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
        r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(-16, 32, 100, -32768);
    __m64 B = _mm_setr_pi16( 64, 30, -9, 32767);
    short4 C = cast(short4) _mm_hsubs_pi16(A, B);
    short[4] correct = [ -48, 32767, 34, -32768];
    assert(C.array == correct);
}


/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding
/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers.
/// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
/// and pack the saturated results.
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // zero-extend a to 16-bit
        __m128i zero = _mm_setzero_si128();
        __m128i a_lo = _mm_unpacklo_epi8(a, zero);
        __m128i a_hi = _mm_unpackhi_epi8(a, zero);

        // sign-extend b to 16-bit
        __m128i b_lo = _mm_unpacklo_epi8(b, zero);
        __m128i b_hi = _mm_unpackhi_epi8(b, zero);
        b_lo = _mm_srai_epi16( _mm_slli_epi16(b_lo, 8), 8);
        b_hi = _mm_srai_epi16( _mm_slli_epi16(b_hi, 8), 8);

        // Multiply element-wise, no overflow can occur
        __m128i c_lo = _mm_mullo_epi16(a_lo, b_lo);
        __m128i c_hi = _mm_mullo_epi16(a_hi, b_hi);

        // Add pairwise with saturating horizontal add
        return _mm_hadds_epi16(c_lo, c_hi);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( -1, 10, 100, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // u8
    __m128i B = _mm_setr_epi8(-128, -30, 100, 127, -1, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0); // i8
    short8 C = cast(short8) _mm_maddubs_epi16(A, B);
    short[8] correct = [ -32768, 26256, 0, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding
/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers.
/// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
/// and pack the saturated results.
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_pmaddubsw(cast(byte8)a, cast(byte8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        __m128i A = to_m128i(a);
        __m128i B = to_m128i(b);
        return to_m64( cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)A, cast(byte16)B) );
    }
    else
    {
        // zero-extend a to 16-bit
        __m128i zero = _mm_setzero_si128();
        __m128i A = _mm_unpacklo_epi8(to_m128i(a), zero);

        // sign-extend b to 16-bit
        __m128i B = _mm_unpacklo_epi8(to_m128i(b), zero);
        B = _mm_srai_epi16( _mm_slli_epi16(B, 8), 8);

        // Multiply element-wise, no overflow can occur
        __m128i c = _mm_mullo_epi16(A, B);

        // Add pairwise with saturating horizontal add
        return to_m64( _mm_hadds_epi16(c, zero));
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( -1, 10, 100, -128, 0, 0, 0, 0); // u8
    __m64 B = _mm_setr_pi8(-128, -30, 100, 127, -1, 2, 4, 6); // i8
    short4 C = cast(short4) _mm_maddubs_pi16(A, B);
    short[4] correct = [ -32768, 26256, 0, 0];
    assert(C.array == correct);
}
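
// Illustrative usage, as a minimal sketch: `_mm_maddubs_epi16` is a common building
// block for u8 x i8 dot products (the variable names below are arbitrary); each output
// lane holds the sum of two adjacent products, saturated to 16-bit.
unittest
{
    __m128i samples = _mm_setr_epi8(10, 20, 30, 40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // unsigned
    __m128i weights = _mm_setr_epi8( 1, -2,  3, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // signed
    short8 dots = cast(short8) _mm_maddubs_epi16(samples, weights);
    assert(dots.array[0] == 10*1 + 20*(-2)); // -30
    assert(dots.array[1] == 30*3 + 40*(-4)); // -70
}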

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        int4 mul_lo = vmull_s16(vget_low_s16(cast(short8)a),
                                vget_low_s16(cast(short8)b));
        int4 mul_hi = vmull_s16(vget_high_s16(cast(short8)a),
                                vget_high_s16(cast(short8)b));

        // Rounding narrowing shift right
        // narrow = (int16_t)((mul + 16384) >> 15);
        short4 narrow_lo = vrshrn_n_s32(mul_lo, 15);
        short4 narrow_hi = vrshrn_n_s32(mul_hi, 15);

        // Join together.
        return cast(__m128i) vcombine_s16(narrow_lo, narrow_hi);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;

        for (int i = 0; i < 8; ++i)
        {
            // I doubted it at first, but an exhaustive search shows this to be equivalent to Intel pseudocode.
            r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
        }

        return cast(__m128i)r;
    }
}

unittest
{
    __m128i A = _mm_setr_epi16(12345, -32768, 32767, 0, 1, 845, -6999, -1);
    __m128i B = _mm_setr_epi16(8877, -24487, 15678, 32760, 1, 0, -149, -1);
    short8 C = cast(short8) _mm_mulhrs_epi16(A, B);
    short[8] correct = [3344, 24487, 15678, 0, 0, 0, 32, 0];
    assert(C.array == correct);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_pmulhrsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m64) to_m64( cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8) to_m128i(a), cast(short8) to_m128i(b)));
    }
    else static if (LDC_with_ARM64)
    {
        int4 mul = vmull_s16(cast(short4)a, cast(short4)b);

        // Rounding narrowing shift right
        // (int16_t)((mul + 16384) >> 15);
        return cast(__m64) vrshrn_n_s32(mul, 15);
    }
    else
    {
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;

        for (int i = 0; i < 4; ++i)
        {
            r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(12345, -32768, 32767, 0);
    __m64 B = _mm_setr_pi16(8877, -24487, 15678, 32760);
    short4 C = cast(short4) _mm_mulhrs_pi16(A, B);
    short[4] correct = [3344, 24487, 15678, 0];
    assert(C.array == correct);
}
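
// Illustrative usage, as a minimal sketch: `_mm_mulhrs_epi16` behaves like a rounded
// Q15 fixed-point multiply, so 0.5 * 0.5 (16384 * 16384) yields 0.25 (8192).
unittest
{
    __m128i half = _mm_set1_epi16(16384); // 0.5 in Q15
    short8 q = cast(short8) _mm_mulhrs_epi16(half, half);
    assert(q.array[0] == 8192);           // 0.25 in Q15
}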

/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
__m128i _mm_shuffle_epi8 (__m128i a, __m128i b) @trusted
{
    // This is the lovely pshufb.
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
    }
    else static if (LDC_with_ARM64)
    {
        byte16 bb = cast(byte16)b;
        byte16 mask;
        mask = cast(byte)(0x8F);
        bb = bb & mask;
        byte16 r = vqtbl1q_s8(cast(byte16)a, bb);
        return cast(__m128i)r;
    }
    else
    {
        byte16 r;
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        for (int i = 0; i < 16; ++i)
        {
            byte s = bb.array[i];
            r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 15 ];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m128i B = _mm_setr_epi8(15, -128, 13 + 16, -12, 11, -10, 9, 8, 7, 6, -5, 4, 3, -2, 1, 0);
    byte16 C = cast(byte16) _mm_shuffle_epi8(A, B);
    byte[16] correct = [0, 0, 2, 0, 4, 0, 6, 7, 8, 9, 0, 11, 12, 0, 14, 15];
    assert(C.array == correct);
}

/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
__m64 _mm_shuffle_pi8 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        alias ubyte8 = __vector(ubyte[8]);
        return cast(__m64) __builtin_ia32_pshufb(cast(ubyte8) a, cast(ubyte8) b);
    }
    else static if (LDC_with_SSSE3)
    {
        // GDC does a proper dance to avoid MMX registers; do it manually in LDC,
        // since __builtin_ia32_pshufb doesn't exist there.
        __m128i A = to_m128i(a);
        __m128i index = to_m128i(b);
        index = index & _mm_set1_epi32(0xF7F7F7F7);
        return to_m64( cast(__m128i) __builtin_ia32_pshufb128(cast(byte16)A, cast(byte16) index) );
    }
    else static if (LDC_with_ARM64)
    {
        byte8 bb = cast(byte8)b;
        byte8 mask;
        mask = cast(byte)(0x87);
        bb = bb & mask;
        __m128i l = to_m128i(a);
        byte8 r = vtbl1_s8(cast(byte16)l, cast(byte8)bb);
        return cast(__m64)r;
    }
    else
    {
        byte8 r;
        byte8 ba = cast(byte8)a;
        byte8 bb = cast(byte8)b;
        for (int i = 0; i < 8; ++i)
        {
            byte s = bb.array[i];
            r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 7 ];
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    __m64 B = _mm_setr_pi8(7, 6, -5, 4, 3 + 8, -2, 1, 0);
    byte8 C = cast(byte8) _mm_shuffle_pi8(A, B);
    byte[8] correct = [0, 1, 0, 3, 4, 0, 6, 7];
    assert(C.array == correct);
}
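
// Illustrative usage, as a minimal sketch: `_mm_shuffle_epi8` doubles as a 16-entry
// table lookup; here it maps nibble values to ASCII hexadecimal digits.
unittest
{
    __m128i hexChars = _mm_setr_epi8('0', '1', '2', '3', '4', '5', '6', '7',
                                     '8', '9', 'a', 'b', 'c', 'd', 'e', 'f');
    __m128i nibbles = _mm_setr_epi8(0, 1, 9, 10, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 ascii = cast(byte16) _mm_shuffle_epi8(hexChars, nibbles);
    assert(ascii.array[2] == '9');
    assert(ascii.array[3] == 'a');
    assert(ascii.array[4] == 'f');
}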

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // LDC arm64: 5 instructions
        __m128i mask = _mm_srai_epi16(b, 15);
        __m128i zeromask = _mm_cmpeq_epi16(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi16(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-2, -1, 0, 1, 2, short.min, short.min, short.min);
    __m128i B = _mm_setr_epi16(-1, 0,-1, 1, -2, -50, 0, 50);
    short8 C = cast(short8) _mm_sign_epi16(A, B);
    short[8] correct = [ 2, 0, 0, 1, -2, short.min, 0, short.min];
    assert(C.array == correct);
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi32 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
    }
    else
    {
        __m128i mask = _mm_srai_epi32(b, 31);
        __m128i zeromask = _mm_cmpeq_epi32(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi32(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-2, -1, 0, int.max);
    __m128i B = _mm_setr_epi32(-1, 0, -1, 1);
    int4 C = cast(int4) _mm_sign_epi32(A, B);
    int[4] correct = [ 2, 0, 0, int.max];
    assert(C.array == correct);
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi8 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        __m128i mask = _mm_cmplt_epi8(b, _mm_setzero_si128()); // extend sign bit
        __m128i zeromask = _mm_cmpeq_epi8(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi8(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-2, -1, 0, 1, 2, byte.min, byte.min, byte.min, -1, 0,-1, 1, -2, -50, 0, 50);
    __m128i B = _mm_setr_epi8(-1, 0,-1, 1, -2, -50, 0, 50, -2, -1, 0, 1, 2, byte.min, byte.min, byte.min);
    byte16 C = cast(byte16) _mm_sign_epi8(A, B);
    byte[16] correct = [ 2, 0, 0, 1, -2, byte.min, 0, byte.min, 1, 0, 0, 1, -2, 50, 0, -50];
    assert(C.array == correct);
}
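
// Illustrative property, as a minimal sketch: with `b == a`, each lane of
// `_mm_sign_epi16` is negated exactly when it is negative and zero lanes stay zero,
// so the result matches `_mm_abs_epi16`.
unittest
{
    __m128i A = _mm_setr_epi16(-5, 0, 7, -32768, 3, -3, 100, -100);
    short8 S = cast(short8) _mm_sign_epi16(A, A);
    short8 R = cast(short8) _mm_abs_epi16(A);
    assert(S.array == R.array);
}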

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi16 (__m64 a, __m64 b) @trusted
{
    return to_m64( _mm_sign_epi16( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi16( 2, short.min, short.min, short.min);
    __m64 B = _mm_setr_pi16(-2, -50, 0, 50);
    short4 C = cast(short4) _mm_sign_pi16(A, B);
    short[4] correct = [-2, short.min, 0, short.min];
    assert(C.array == correct);
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi32 (__m64 a, __m64 b) @trusted
{
    return to_m64( _mm_sign_epi32( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi32(-2, -100);
    __m64 B = _mm_setr_pi32(-1, 0);
    int2 C = cast(int2) _mm_sign_pi32(A, B);
    int[2] correct = [ 2, 0];
    assert(C.array == correct);
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi8 (__m64 a, __m64 b) @trusted
{
    return to_m64( _mm_sign_epi8( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi8(-2, -1, 0, 1, 2, byte.min, byte.min, byte.min);
    __m64 B = _mm_setr_pi8(-1, 0,-1, 1, -2, -50, 0, 50);
    byte8 C = cast(byte8) _mm_sign_pi8(A, B);
    byte[8] correct = [ 2, 0, 0, 1, -2, byte.min, 0, byte.min];
    assert(C.array == correct);
}