/**
* SSSE3 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSSE3
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.tmmintrin;

public import inteli.types;
import inteli.internals;

public import inteli.pmmintrin;
import inteli.mmx;

nothrow @nogc:


// SSSE3 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSSE3
// Note: this header will work whether you have SSSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+ssse3"] or equivalent to actively
// generate SSSE3 instructions.
// With GDC, use "dflags-gdc": ["-mssse3"] or equivalent to generate SSSE3 instructions.

/// Compute the absolute value of packed signed 16-bit integers in `a`.
/// Note: like the PABSW instruction, abs(-32768) remains -32768 (no saturation).
__m128i _mm_abs_epi16 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSW, a);
    }
    else static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pabsw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s16(cast(short8)a);
    }
    else
    {
        // LDC x86: generate pabsw since LDC 1.1 -O2
        short8 sa = cast(short8)a;
        for (int i = 0; i < 8; ++i)
        {
            short s = sa.array[i];
            // Negate through int to avoid relying on short negation semantics;
            // the cast back to short makes -(-32768) wrap to -32768 as PABSW does.
            sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s));
        }
        return cast(__m128i)sa;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000);
    short8 B = cast(short8) _mm_abs_epi16(A);
    short[8] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m128i _mm_abs_epi32 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSD, cast(int4)a);
    }
    else static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pabsd128(cast(int4)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s32(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pabsd since LDC 1.1 -O2
        int4 sa = cast(int4)a;
        for (int i = 0; i < 4; ++i)
        {
            int s = sa.array[i];
            // Like PABSD, abs(int.min) stays int.min (wraps, no saturation).
            sa.ptr[i] = s >= 0 ? s : -s;
        }
        return cast(__m128i)sa;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647);
    int4 B = cast(int4) _mm_abs_epi32(A);
    int[4] correct = [0, 1, -2_147_483_648, 2_147_483_647];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
/// Note: like the PABSB instruction, abs(-128) remains -128.
__m128i _mm_abs_epi8 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSB, cast(byte16)a);
    }
    else static if (GDC_with_SSSE3)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pabsb128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s8(cast(byte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: generates pabsb since LDC 1.1 -O1
        //  arm64: generates abs since LDC 1.8 -O1
        enum ir = `
                %n = sub <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %0
                %s = icmp slt <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %0
                %r = select <16 x i1> %s, <16 x i8> %0, <16 x i8> %n
                ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16)(cast(byte16)a);
    }
    else
    {
        // A loop version like in _mm_abs_epi16/_mm_abs_epi32 would be very slow
        // in LDC x86 and wouldn't vectorize. Doesn't generate pabsb in LDC though.
        // Trick: min_epu8(x, -x) is abs for signed bytes, since exactly one of
        // x / -x is <= 127 when interpreted as unsigned (both when x == -128).
        return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 B = cast(byte16) _mm_abs_epi8(A);
    byte[16] correct =       [0,  1, -128,  127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(B.array == correct);
}

/// Compute the absolute value of packed 64-bit floating-point elements in `a`.
/// #BONUS.
__m128d _mm_abs_pd (__m128d a) @trusted
{
    // Clearing the sign bit handles every case, including -0.0, -inf and NaN.
    long2 mask = 0x7fff_ffff_ffff_ffff;
    return cast(__m128d)((cast(long2)a) & mask);
}
unittest
{
    __m128d A = _mm_setr_pd(-42.0f, -double.infinity);
    __m128d R = _mm_abs_pd(A);
    double[2] correct =    [42.0f, +double.infinity];
    assert(R.array == correct);
}

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m64 _mm_abs_pi16 (__m64 a) @trusted
{
    // MMX version: widen to 128-bit, reuse the SSE implementation, narrow back.
    return to_m64(_mm_abs_epi16(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi16(0, -1, -32768, 32767);
    short4 B = cast(short4) _mm_abs_pi16(A);
    short[4] correct = [0, 1, -32768, 32767];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m64 _mm_abs_pi32 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi32(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi32(-1, -2_147_483_648);
    int2 B = cast(int2) _mm_abs_pi32(A);
    int[2] correct = [1, -2_147_483_648];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
__m64 _mm_abs_pi8 (__m64 a) @trusted
{
    // MMX version: widen to 128-bit, reuse the SSE implementation, narrow back.
    return to_m64(_mm_abs_epi8(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi8(0, -1, -128, -127, 127, 0, 0, 0);
    byte8 B = cast(byte8) _mm_abs_pi8(A);
    byte[8] correct =       [0,  1, -128,  127, 127, 0, 0, 0];
    assert(B.array == correct);
}

/// Compute the absolute value of packed 32-bit floating-point elements in `a`.
/// #BONUS.
__m128 _mm_abs_ps (__m128 a) @trusted
{
    // Clearing the sign bit handles every case, including -0.0f, -inf and NaN.
    __m128i mask = 0x7fffffff;
    return cast(__m128)((cast(__m128i)a) & mask);
}
unittest
{
    __m128 A = _mm_setr_ps(-0.0f, 10.0f, -42.0f, -float.infinity);
    __m128 R = _mm_abs_ps(A);
    float[4] correct =    [0.0f, 10.0f, 42.0f, +float.infinity];
    assert(R.array == correct);
}

/// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the result right by `count` bytes, and return the low 16 bytes.
/// `count` >= 32 yields all zeroes.
__m128i _mm_alignr_epi8(ubyte count)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_palignr128(cast(long2)a, cast(long2)b, count * 8);
    }
    else version(LDC)
    {
        static if (count >= 32)
        {
            return _mm_setzero_si128();
        }
        else static if (count < 16)
        {
            // Shuffle picks from the 32-byte concatenation [b | a],
            // starting at byte `count`.
            // Generates palignr since LDC 1.1 -O1
            // Also generates a single ext instruction on arm64.
            return cast(__m128i) shufflevectorLDC!(byte16, ( 0 + count),
                                                           ( 1 + count),
                                                           ( 2 + count),
                                                           ( 3 + count),
                                                           ( 4 + count),
                                                           ( 5 + count),
                                                           ( 6 + count),
                                                           ( 7 + count),
                                                           ( 8 + count),
                                                           ( 9 + count),
                                                           (10 + count),
                                                           (11 + count),
                                                           (12 + count),
                                                           (13 + count),
                                                           (14 + count),
                                                           (15 + count))(cast(byte16)b, cast(byte16)a);
        }
        else
        {
            // 16 <= count < 32: bytes come from `a` or are zero, so shuffle
            // the concatenation [zero | a]; the % 32 wraps out-of-range lanes.
            return cast(__m128i) shufflevectorLDC!(byte16, ( 0 + count) % 32,
                                                           ( 1 + count) % 32,
                                                           ( 2 + count) % 32,
                                                           ( 3 + count) % 32,
                                                           ( 4 + count) % 32,
                                                           ( 5 + count) % 32,
                                                           ( 6 + count) % 32,
                                                           ( 7 + count) % 32,
                                                           ( 8 + count) % 32,
                                                           ( 9 + count) % 32,
                                                           (10 + count) % 32,
                                                           (11 + count) % 32,
                                                           (12 + count) % 32,
                                                           (13 + count) % 32,
                                                           (14 + count) % 32,
                                                           (15 + count) % 32)(cast(byte16)_mm_setzero_si128(), cast(byte16)a);
        }
    }
    else
    {
        // Generic scalar fallback: index byte `count + i` of [b | a | zeroes].
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        byte16 r;

        for (int i = 0; i < 16; ++i)
        {
            const int srcpos = count + cast(int)i;
            if (srcpos > 31)
            {
                r.ptr[i] = 0;
            }
            else if (srcpos > 15)
            {
                r.ptr[i] = ab.array[(srcpos) & 15];
            }
            else
            {
                r.ptr[i] = bb.array[srcpos];
            }
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);

    {
        byte16 C = cast(byte16)_mm_alignr_epi8!0(A ,B);
        byte[16] correct = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
        assert(C.array == correct);
    }
    {
        byte16 C = cast(byte16)_mm_alignr_epi8!20(A ,B);
        byte[16] correct = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0];
        assert(C.array == correct);
    }
    {
        byte16 C = cast(byte16)_mm_alignr_epi8!34(A ,B);
        byte[16] correct = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        assert(C.array == correct);
    }

    __m128i D = _mm_setr_epi8(-123, -82, 103, -69, 103, -26, 9, 106, 58, -11, 79, -91, 114, -13, 110, 60);
    __m128i E = _mm_setr_epi8(25, -51, -32, 91, -85, -39, -125, 31, -116, 104, 5, -101, 127, 82, 14, 81);
    byte16 F = cast(byte16)_mm_alignr_epi8!8(D, E);
    byte[16] correct = [-116, 104, 5, -101, 127, 82, 14, 81, -123, -82, 103, -69, 103, -26, 9, 106];
    assert(F.array == correct);
}

/// Concatenate 8-byte blocks in `a` and `b` into a 16-byte temporary result, shift the result right by `count` bytes, and return the low 8 bytes.
/// `count` >= 16 yields all zeroes.
__m64 _mm_alignr_pi8(ubyte count)(__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_palignr(cast(long1)a, cast(long1)b, count * 8);
    }
    else version(LDC)
    {
        static if (count >= 16)
        {
            return _mm_setzero_si64();
        }
        else static if (count < 8)
        {
            // Shuffle picks from the 16-byte concatenation [b | a].
            // Note: in LDC x86 this uses a pshufb.
            // Generates ext in arm64.
            return cast(__m64) shufflevectorLDC!(byte8, (0 + count),
                                                        (1 + count),
                                                        (2 + count),
                                                        (3 + count),
                                                        (4 + count),
                                                        (5 + count),
                                                        (6 + count),
                                                        (7 + count))(cast(byte8)b, cast(byte8)a);
        }
        else
        {
            // 8 <= count < 16: bytes come from `a` or are zero.
            return cast(__m64) shufflevectorLDC!(byte8, (0 + count)%16,
                                                        (1 + count)%16,
                                                        (2 + count)%16,
                                                        (3 + count)%16,
                                                        (4 + count)%16,
                                                        (5 + count)%16,
                                                        (6 + count)%16,
                                                        (7 + count)%16)(cast(byte8)_mm_setzero_si64(), cast(byte8)a);
        }
    }
    else
    {
        // Generic scalar fallback: index byte `count + i` of [b | a | zeroes].
        byte8 ab = cast(byte8)a;
        byte8 bb = cast(byte8)b;
        byte8 r;

        for (int i = 0; i < 8; ++i)
        {
            const int srcpos = count + cast(int)i;
            if (srcpos > 15)
            {
                r.ptr[i] = 0;
            }
            else if (srcpos > 7)
            {
                r.ptr[i] = ab.array[(srcpos) & 7];
            }
            else
            {
                r.ptr[i] = bb.array[srcpos];
            }
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
    __m64 B = _mm_setr_pi8(17, 18, 19, 20, 21, 22, 23, 24);

    {
        byte8 C = cast(byte8)_mm_alignr_pi8!0(A ,B);
        byte[8] correct = [17, 18, 19, 20, 21, 22, 23, 24];
        assert(C.array == correct);
    }

    {
        byte8 C = cast(byte8)_mm_alignr_pi8!3(A ,B);
        byte[8] correct = [ 20, 21, 22, 23, 24, 1, 2, 3];
        assert(C.array == correct);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!11(A ,B);
        byte[8] correct = [4, 5, 6, 7, 8, 0, 0, 0];
        assert(C.array == correct);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!17(A ,B);
        byte[8] correct = [0, 0, 0, 0, 0, 0, 0, 0];
        assert(C.array == correct);
    }
}

/// Reverse endianness of 16-bit integers in `a`.
__m128i _mm_bswap_epi16 (__m128i a) pure @safe // #BONUS
{
    // A single pshufb with a byte-swap permutation.
    __m128i order = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
    return _mm_shuffle_epi8(a, order);
}
unittest
{
    __m128i A = _mm_setr_epi16(0x1122, 0x3344, 0, -1, 0x1122, 0x3344, 0, -1);
    short8 R = cast(short8) _mm_bswap_epi16(A);
    short[8] correct = [0x2211, 0x4433, 0, -1, 0x2211, 0x4433, 0, -1];
    assert(R.array == correct);
}

/// Reverse endianness of 32-bit integers in `a`.
__m128i _mm_bswap_epi32 (__m128i a) pure @safe // #BONUS
{
    // A single pshufb with a byte-swap permutation.
    __m128i order = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
    return _mm_shuffle_epi8(a, order);
}
unittest
{
    __m128i A = _mm_setr_epi32(0x11223344, 0x33445566, 0, -1);
    int4 R = cast(int4) _mm_bswap_epi32(A);
    int[4] correct = [0x44332211, 0x66554433, 0, -1];
    assert(R.array == correct);
}

/// Reverse endianness of 64-bit integers in `a`.
__m128i _mm_bswap_epi64 (__m128i a) pure @safe // #BONUS
{
    // A single pshufb with a byte-swap permutation.
    __m128i order = _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
    return _mm_shuffle_epi8(a, order);
}
unittest
{
    __m128i A = _mm_setr_epi64(0x11223344_55667788, -1);
    long2 R = cast(long2) _mm_bswap_epi64(A);
    long[2] correct = [0x88776655_44332211, -1];
    assert(R.array == correct);
}

/// Reverse endianness of 128-bit register `a`.
__m128i _mm_bswap_si128 (__m128i a) pure @safe // #BONUS
{
    // A single pshufb reversing all 16 bytes.
    __m128i order = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    return _mm_shuffle_epi8(a, order);
}
unittest
{
    __m128i A = _mm_setr_epi64(0x11223344_55667788, -1);
    long2 R = cast(long2) _mm_bswap_si128(A);
    long[2] correct = [-1, 0x88776655_44332211, ];
    assert(R.array == correct);
}

/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
/// Note: sums wrap around (no saturation) — see `_mm_hadds_epi16` for the saturating variant.
__m128i _mm_hadd_epi16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i)vpaddq_s16(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
        r.ptr[2] = cast(short)(sa.array[4] + sa.array[5]);
        r.ptr[3] = cast(short)(sa.array[6] + sa.array[7]);
        r.ptr[4] = cast(short)(sb.array[0] + sb.array[1]);
        r.ptr[5] = cast(short)(sb.array[2] + sb.array[3]);
        r.ptr[6] = cast(short)(sb.array[4] + sb.array[5]);
        r.ptr[7] = cast(short)(sb.array[6] + sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
    short8 C = cast(short8) _mm_hadd_epi16(A, A);
    // -1 + -32768 wraps around to 32767.
    short[8] correct = [ -1, 12, 48, 32767, -1, 12, 48, 32767];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
/// Note: sums wrap around (no saturation).
__m128i _mm_hadd_epi32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i)vpaddq_s32(cast(int4)a, cast(int4)b);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] + ia.array[1];
        r.ptr[1] = ia.array[2] + ia.array[3];
        r.ptr[2] = ib.array[0] + ib.array[1];
        r.ptr[3] = ib.array[2] + ib.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, -2, int.min, -1);
    __m128i B = _mm_setr_epi32(1, int.max, 4, -4);
    int4 C = cast(int4) _mm_hadd_epi32(A, B);
    int[4] correct = [ -1, int.max, int.min, 0 ];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
/// Note: sums wrap around (no saturation).
__m64 _mm_hadd_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_phaddw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m64) vpadd_s16(cast(short4)a, cast(short4)b);
    }
    else
    {
        // LDC x86: generates phaddw since LDC 1.24 -O2.
        short4 r;
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
        r.ptr[2] = cast(short)(sb.array[0] + sb.array[1]);
        r.ptr[3] = cast(short)(sb.array[2] + sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(1, -2, 4, 8);
    __m64 B = _mm_setr_pi16(16, 32, -1, -32768);
    short4 C = cast(short4) _mm_hadd_pi16(A, B);
    // -1 + -32768 wraps around to 32767.
    short[4] correct = [ -1, 12, 48, 32767 ];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`,
/// and pack the signed 32-bit results.
__m64 _mm_hadd_pi32 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_phaddd(cast(int2)a, cast(int2)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m64)vpadd_s32(cast(int2)a, cast(int2)b);
    }
    else
    {
        // LDC x86: generates phaddd since LDC 1.24 -O2
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 r;
        r.ptr[0] = ia.array[0] + ia.array[1];
        r.ptr[1] = ib.array[0] + ib.array[1];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi32(int.min, -1);
    __m64 B = _mm_setr_pi32(1, int.max);
    int2 C = cast(int2) _mm_hadd_pi32(A, B);
    int[2] correct = [ int.max, int.min ];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m128i _mm_hadds_epi16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqadd sequence
        // De-interleave even and odd lanes of [a | b], then saturating-add them.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevectorLDC!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevectorLDC!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)vqaddq_s16(c, d);
    }
    else
    {
        // PERF well that doesn't look very fast?
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] + sa.array[5]);
        r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] + sa.array[7]);
        r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
        r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
        r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] + sb.array[5]);
        r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] + sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
    short8 C = cast(short8) _mm_hadds_epi16(A, A);
    // -1 + -32768 saturates to -32768 (contrast with _mm_hadd_epi16).
    short[8] correct = [ -1, 12, 48, -32768, -1, 12, 48, -32768];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m64 _mm_hadds_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phaddsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // Note: LDC doesn't have __builtin_ia32_phaddsw
        // Widen both operands into the low half of 128-bit registers and
        // use the 128-bit phaddsw; the results we want land in int lanes 0 and 2.
        long2 la;
        la.ptr[0] = a.array[0];
        long2 lb;
        lb.ptr[0] = b.array[0];
        int4 sum = cast(int4)__builtin_ia32_phaddsw128(cast(short8)la, cast(short8)lb);
        int2 r;
        r.ptr[0] = sum.array[0];
        r.ptr[1] = sum.array[2];
        return cast(__m64)r;
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqadd sequence
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevectorLDC!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevectorLDC!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)vqadd_s16(c, d);
    }
    else
    {
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
        r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(-16, 32, -100, -32768);
    __m64 B = _mm_setr_pi16( 64, 32,    1,  32767);
    short4 C = cast(short4) _mm_hadds_pi16(A, B);
    short[4] correct = [ 16, -32768, 96, 32767];
    assert(C.array == correct);
}


/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m128i _mm_hsub_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produce uzp1 uzp2 sub sequence since LDC 1.8 -O1
        // De-interleave even and odd lanes of [a | b], then subtract them.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevectorLDC!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevectorLDC!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)(c - d);
    }
    else
    {
        // Differences wrap around (no saturation), like the PHSUBW instruction.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
        r.ptr[2] = cast(short)(sa.array[4] - sa.array[5]);
        r.ptr[3] = cast(short)(sa.array[6] - sa.array[7]);
        r.ptr[4] = cast(short)(sb.array[0] - sb.array[1]);
        r.ptr[5] = cast(short)(sb.array[2] - sb.array[3]);
        r.ptr[6] = cast(short)(sb.array[4] - sb.array[5]);
        r.ptr[7] = cast(short)(sb.array[6] - sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(short.min, 1, 4, 8, 16, 32, 1, -32768);
    short8 C = cast(short8) _mm_hsub_epi16(A, A);
    // short.min - 1 wraps to short.max; 1 - (-32768) wraps to -32767.
    short[8] correct = [ short.max, -4, -16, -32767, short.max, -4, -16, -32767];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m128i _mm_hsub_epi32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produce uzp1 uzp2 sub sequence since LDC 1.8 -O1
        // De-interleave even and odd lanes of [a | b], then subtract them.
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 c = shufflevectorLDC!(int4, 0, 2, 4, 6)(ia, ib);
        int4 d = shufflevectorLDC!(int4, 1, 3, 5, 7)(ia, ib);
        return cast(__m128i)(c - d);
    }
    else
    {
        // Differences wrap around (no saturation), like the PHSUBD instruction.
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] - ia.array[1];
        r.ptr[1] = ia.array[2] - ia.array[3];
        r.ptr[2] = ib.array[0] - ib.array[1];
        r.ptr[3] = ib.array[2] - ib.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, int.min, 1);
    __m128i B = _mm_setr_epi32(int.max, -1, 4, 4);
    int4 C = cast(int4) _mm_hsub_epi32(A, B);
    int[4] correct = [ -1, int.max, int.min, 0 ];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`,
/// and pack the signed 16-bit results.
__m64 _mm_hsub_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produce uzp1 uzp2 sub sequence since LDC 1.3 -O1
        // De-interleave even and odd lanes of [a | b], then subtract them.
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevectorLDC!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevectorLDC!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)(c - d);
    }
    else
    {
        // LDC x86: generates phsubw since LDC 1.24 -O2
        // Differences wrap around (no saturation).
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
        r.ptr[2] = cast(short)(sb.array[0] - sb.array[1]);
        r.ptr[3] = cast(short)(sb.array[2] - sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(short.min, 1, 4, 8);
    __m64 B = _mm_setr_pi16(16, 32, 1, -32768);
    short4 C = cast(short4) _mm_hsub_pi16(A, B);
    // short.min - 1 wraps to short.max; 1 - (-32768) wraps to -32767.
    short[4] correct = [ short.max, -4, -16, -32767];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`,
/// and pack the signed 32-bit results.
__m64 _mm_hsub_pi32 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubd(cast(int2)a, cast(int2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: generates zip1+zip2+sub sequence since LDC 1.8 -O1
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 c = shufflevectorLDC!(int2, 0, 2)(ia, ib);
        int2 d = shufflevectorLDC!(int2, 1, 3)(ia, ib);
        return cast(__m64)(c - d);
    }
    else
    {
        // LDC x86: generates phsubd since LDC 1.24 -O2
        // Differences wrap around (no saturation).
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 r;
        r.ptr[0] = ia.array[0] - ia.array[1];
        r.ptr[1] = ib.array[0] - ib.array[1];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi32(int.min, 1);
    __m64 B = _mm_setr_pi32(int.max, -1);
    int2 C = cast(int2) _mm_hsub_pi32(A, B);
    int[2] correct = [ int.max, int.min ];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqsub sequence
        // De-interleave even and odd lanes of [a | b], then saturating-subtract.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevectorLDC!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevectorLDC!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)vqsubq_s16(c, d);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] - sa.array[5]);
        r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] - sa.array[7]);
        r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
        r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
        r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] - sb.array[5]);
        r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] - sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 32767, -1, -10, 32767);
    short8 C = cast(short8) _mm_hsubs_epi16(A, A);
    // 32767 - (-1) saturates to 32767; -10 - 32767 saturates to -32768.
    short[8] correct = [ 3, -4, 32767, -32768, 3, -4, 32767, -32768 ];
    assert(C.array == correct);
}


/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // Note: LDC doesn't have __builtin_ia32_phsubsw
        // Widen both operands into the low half of 128-bit registers and
        // use the 128-bit phsubsw; the results we want land in int lanes 0 and 2.
        long2 la;
        la.ptr[0] = a.array[0];
        long2 lb;
        lb.ptr[0] = b.array[0];
        int4 sum = cast(int4)__builtin_ia32_phsubsw128(cast(short8)la, cast(short8)lb);
        int2 r;
        r.ptr[0] = sum.array[0];
        r.ptr[1] = sum.array[2];
        return cast(__m64)r;
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqsub sequence in -O1
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevectorLDC!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevectorLDC!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)vqsub_s16(c, d);
    }
    else
    {
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
        r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(-16, 32, 100, -32768);
    __m64 B = _mm_setr_pi16( 64, 30,  -9,  32767);
    short4 C = cast(short4) _mm_hsubs_pi16(A, B);
    short[4] correct = [ -48, 32767, 34, -32768];
    assert(C.array == correct);
}

/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding
/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers.
/// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
/// and pack the saturated results.
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // zero-extend a to 16-bit (a is treated as unsigned bytes)
        __m128i zero = _mm_setzero_si128();
        __m128i a_lo = _mm_unpacklo_epi8(a, zero);
        __m128i a_hi = _mm_unpackhi_epi8(a, zero);

        // sign-extend b to 16-bit (b is treated as signed bytes);
        // shift left then arithmetic-shift right replicates the sign bit.
        __m128i b_lo = _mm_unpacklo_epi8(b, zero);
        __m128i b_hi = _mm_unpackhi_epi8(b, zero);
        b_lo = _mm_srai_epi16( _mm_slli_epi16(b_lo, 8), 8);
        b_hi = _mm_srai_epi16( _mm_slli_epi16(b_hi, 8), 8);

        // Multiply element-wise, no overflow can occur
        // (u8 * i8 fits in 16 bits: range is [-32640, 32385]).
        __m128i c_lo = _mm_mullo_epi16(a_lo, b_lo);
        __m128i c_hi = _mm_mullo_epi16(a_hi, b_hi);

        // Add pairwise with saturating horizontal add
        return _mm_hadds_epi16(c_lo, c_hi);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( -1,  10, 100, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // u8
    __m128i B = _mm_setr_epi8(-128, -30, 100,  127, -1, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0); // i8
    short8 C = cast(short8) _mm_maddubs_epi16(A, B);
    short[8] correct =       [   -32768,     26256, 0, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding
/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers.
/// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
/// and pack the saturated results.
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_pmaddubsw(cast(ubyte8)a, cast(ubyte8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // LDC has no 64-bit pmaddubsw builtin: widen to 128-bit, use
        // PMADDUBSW, and keep the low 64 bits of the result.
        // (Previously `A` and `B` were computed but unused, and the
        // conversions were redundantly done a second time.)
        __m128i A = to_m128i(a);
        __m128i B = to_m128i(b);
        return to_m64( cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)A, cast(byte16)B) );
    }
    else
    {
        // Portable fallback, same technique as _mm_maddubs_epi16 but only
        // the low 8 bytes carry data.

        // zero-extend a to 16-bit
        __m128i zero = _mm_setzero_si128();
        __m128i A = _mm_unpacklo_epi8(to_m128i(a), zero);

        // sign-extend b to 16-bit (shift left 8 then arithmetic shift right 8)
        __m128i B = _mm_unpacklo_epi8(to_m128i(b), zero);
        B = _mm_srai_epi16( _mm_slli_epi16(B, 8), 8);

        // Multiply element-wise; a u8 * i8 product always fits in a signed
        // 16-bit integer, so no overflow can occur.
        __m128i c = _mm_mullo_epi16(A, B);

        // Add pairwise with saturating horizontal add
        return to_m64( _mm_hadds_epi16(c, zero));
    }
}
unittest
{
    __m64 A = _mm_setr_pi8( -1,  10, 100, -128, 0, 0, 0, 0); // u8
    __m64 B = _mm_setr_pi8(-128, -30, 100, 127, -1, 2, 4, 6); // i8
    short4 C = cast(short4) _mm_maddubs_pi16(A, B);
    short[4] correct = [ -32768, 26256, 0, 0];
    assert(C.array == correct);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Widening 16x16 -> 32 multiplies of the low and high halves.
        int4 mul_lo = vmull_s16(vget_low_s16(cast(short8)a),
                                vget_low_s16(cast(short8)b));
        int4 mul_hi = vmull_s16(vget_high_s16(cast(short8)a),
                                vget_high_s16(cast(short8)b));

        // Rounding narrowing shift right
        // narrow = (int16_t)((mul + 16384) >> 15);
        short4 narrow_lo = vrshrn_n_s32(mul_lo, 15);
        short4 narrow_hi = vrshrn_n_s32(mul_hi, 15);

        // Join together.
        return cast(__m128i) vcombine_s16(narrow_lo, narrow_hi);
    }
    else
    {
        // Scalar fallback: round-to-nearest by adding 0x4000 before the
        // 15-bit arithmetic shift.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;

        for (int i = 0; i < 8; ++i)
        {
            // I doubted it at first, but an exhaustive search shows this to be
            // equivalent to the Intel pseudocode.
            r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
        }

        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(12345, -32768, 32767, 0, 1, 845, -6999, -1);
    __m128i B = _mm_setr_epi16(8877, -24487, 15678, 32760, 1, 0, -149, -1);
    short8 C = cast(short8) _mm_mulhrs_epi16(A, B);
    short[8] correct = [3344, 24487, 15678, 0, 0, 0, 32, 0];
    assert(C.array == correct);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_pmulhrsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // LDC has no 64-bit builtin: widen to 128-bit, use PMULHRSW, keep the low half.
        return cast(__m64) to_m64( cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8) to_m128i(a), cast(short8) to_m128i(b)));
    }
    else static if (LDC_with_ARM64)
    {
        // Widening 16x16 -> 32 multiply.
        int4 mul = vmull_s16(cast(short4)a, cast(short4)b);

        // Rounding narrowing shift right
        // (int16_t)((mul + 16384) >> 15);
        return cast(__m64) vrshrn_n_s32(mul, 15);
    }
    else
    {
        // Scalar fallback, same rounding formula as _mm_mulhrs_epi16.
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;

        for (int i = 0; i < 4; ++i)
        {
            r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(12345, -32768, 32767, 0);
    __m64 B = _mm_setr_pi16(8877, -24487, 15678, 32760);
    short4 C = cast(short4) _mm_mulhrs_pi16(A, B);
    short[4] correct = [3344, 24487, 15678, 0];
    assert(C.array == correct);
}

/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
__m128i _mm_shuffle_epi8 (__m128i a, __m128i b) pure @trusted
{
    // This is the lovely pshufb.
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pshufb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
    }
    else static if (LDC_with_ARM64)
    {
        byte16 bb = cast(byte16)b;
        byte16 mask;
        mask = cast(byte)(0x8F); // keep the 4 index bits, and bit 7 (the "zero this lane" bit)
        bb = bb & mask;
        // "If an index is out of range for the table, the result for that lookup is 0."
        // So, having bit 7 in indices will yield 0 correctly.
        byte16 r = vqtbl1q_s8(cast(byte16)a, bb);
        return cast(__m128i)r;
    }
    else
    {
        // Scalar fallback: a negative control byte yields 0,
        // otherwise select a[control & 15].
        byte16 r;
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        for (int i = 0; i < 16; ++i)
        {
            byte s = bb.array[i];
            r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 15 ];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m128i B = _mm_setr_epi8(15, -128, 13 + 16, -12, 11, -10, 9, 8, 7, 6, -5, 4, 3, -2, 1, 0);
    byte16 C = cast(byte16) _mm_shuffle_epi8(A, B);
    byte[16] correct = [0, 0, 2, 0, 4, 0, 6, 7, 8, 9, 0, 11, 12, 0, 14, 15];
    assert(C.array == correct);
}

/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
__m64 _mm_shuffle_pi8 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        alias ubyte8 =__vector(ubyte[8]);
        return cast(__m64) __builtin_ia32_pshufb(cast(ubyte8) a, cast(ubyte8) b);
    }
    else static if (LDC_with_SSSE3)
    {
        // GDC does proper dance to avoid mmx registers, do it manually in LDC
        // since __builtin_ia32_pshufb doesn't exist there.
        // Masking each index with 0xF7 clears bit 3, so lookups stay inside
        // the low 8 bytes (which hold `a`), while bit 7 is preserved so
        // negative control bytes still zero the lane.
        __m128i A = to_m128i(a);
        __m128i index = to_m128i(b);
        index = index & _mm_set1_epi32(0xF7F7F7F7);
        return to_m64( cast(__m128i) __builtin_ia32_pshufb128(cast(byte16)A, cast(byte16) index) );
    }
    else static if (LDC_with_ARM64)
    {
        byte8 bb = cast(byte8)b;
        byte8 mask;
        mask = cast(byte)(0x87); // keep the 3 index bits and bit 7; out-of-range indices yield 0 in TBL
        bb = bb & mask;
        __m128i l = to_m128i(a);
        byte8 r = vtbl1_s8(cast(byte16)l, cast(byte8)bb);
        return cast(__m64)r;
    }
    else
    {
        // Scalar fallback: a negative control byte yields 0,
        // otherwise select a[control & 7].
        byte8 r;
        byte8 ba = cast(byte8)a;
        byte8 bb = cast(byte8)b;
        for (int i = 0; i < 8; ++i)
        {
            byte s = bb.array[i];
            r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 7 ];
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    __m64 B = _mm_setr_pi8(7, 6, -5, 4, 3 + 8, -2, 1, 0);
    byte8 C = cast(byte8) _mm_shuffle_pi8(A, B);
    byte[8] correct = [0, 1, 0, 3, 4, 0, 6, 7];
    assert(C.array == correct);
}

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi16 (__m128i a, __m128i b) pure @safe
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // LDC arm64: 5 instructions
        // mask is all-ones where b < 0; (a + mask) ^ mask is the two's
        // complement negate of `a` in those lanes, and `a` unchanged elsewhere.
        // The final andnot zeroes lanes where b == 0.
        __m128i mask = _mm_srai_epi16(b, 15);
        __m128i zeromask = _mm_cmpeq_epi16(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi16(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-2, -1, 0, 1,  2, short.min, short.min, short.min);
    __m128i B = _mm_setr_epi16(-1,  0,-1, 1, -2,       -50,         0,        50);
    short8 C = cast(short8) _mm_sign_epi16(A, B);
    short[8] correct =      [ 2,  0, 0, 1, -2, short.min,         0, short.min];
    assert(C.array == correct);
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi32 (__m128i a, __m128i b) pure @safe
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        // PSIGND operates on 32-bit lanes: the builtin takes v4si operands,
        // so the casts must be to int4, not short8 (which would not compile
        // and would describe the wrong lane width).
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
    }
    else
    {
        // mask is all-ones where b < 0; (a + mask) ^ mask is the two's
        // complement negate of `a` in those lanes, and `a` unchanged elsewhere.
        // The final andnot zeroes lanes where b == 0.
        __m128i mask = _mm_srai_epi32(b, 31);
        __m128i zeromask = _mm_cmpeq_epi32(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi32(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-2, -1,  0, int.max);
    __m128i B = _mm_setr_epi32(-1,  0, -1, 1);
    int4 C = cast(int4) _mm_sign_epi32(A, B);
    int[4] correct =          [ 2,  0,  0, int.max];
    assert(C.array == correct);
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi8 (__m128i a, __m128i b) pure @safe
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // There is no 8-bit arithmetic shift, so build the sign mask with a compare.
        __m128i mask = _mm_cmplt_epi8(b, _mm_setzero_si128()); // extend sign bit
        __m128i zeromask = _mm_cmpeq_epi8(b, _mm_setzero_si128());
        // (a + mask) ^ mask negates `a` where mask is all-ones;
        // the andnot zeroes lanes where b == 0.
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi8(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-2, -1, 0, 1,  2, byte.min, byte.min, byte.min, -1,  0,-1, 1,  2, byte.min, byte.min, byte.min);
    __m128i B = _mm_setr_epi8(-1,  0,-1, 1, -2,      -50,        0,       50, -2, -1, 0, 1,  2, byte.min, byte.min, byte.min);
    byte16 C = cast(byte16) _mm_sign_epi8(A, B);
    byte[16] correct =       [ 2,  0, 0, 1, -2, byte.min,        0, byte.min,  1,  0, 0, 1, -2,       50,        0,      -50];
    assert(C.array == correct);
}

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi16 (__m64 a, __m64 b) @trusted
{
    // Widen to 128-bit, reuse _mm_sign_epi16, keep the low 64 bits.
    return to_m64( _mm_sign_epi16( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi16( 2, short.min, short.min, short.min);
    __m64 B = _mm_setr_pi16(-2,       -50,         0,        50);
    short4 C = cast(short4) _mm_sign_pi16(A, B);
    short[4] correct =     [-2, short.min,         0, short.min];
    assert(C.array == correct);
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi32 (__m64 a, __m64 b) @trusted
{
    // Delegate to the 128-bit version; only the low 64 bits are meaningful.
    __m128i wideA = to_m128i(a);
    __m128i wideB = to_m128i(b);
    return to_m64(_mm_sign_epi32(wideA, wideB));
}
unittest
{
    __m64 input = _mm_setr_pi32(-2, -100);
    __m64 signs = _mm_setr_pi32(-1, 0);
    int2 result = cast(int2) _mm_sign_pi32(input, signs);
    int[2] expected = [2, 0];
    assert(result.array == expected);
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi8 (__m64 a, __m64 b) @trusted
{
    // Delegate to the 128-bit version; only the low 64 bits are meaningful.
    __m128i wideA = to_m128i(a);
    __m128i wideB = to_m128i(b);
    return to_m64(_mm_sign_epi8(wideA, wideB));
}
unittest
{
    __m64 input = _mm_setr_pi8(-2, -1, 0, 1, 2, byte.min, byte.min, byte.min);
    __m64 signs = _mm_setr_pi8(-1, 0, -1, 1, -2, -50, 0, 50);
    byte8 result = cast(byte8) _mm_sign_pi8(input, signs);
    byte[8] expected = [2, 0, 0, 1, -2, byte.min, 0, byte.min];
    assert(result.array == expected);
}