/**
* SSE2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1
import inteli.mmx;
import inteli.internals;

nothrow @nogc:


// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(short8)a + cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 R = cast(short8) _mm_add_epi16(A, A);
    short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(int4)a + cast(int4)b);
}
unittest
{
    __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
    int4 R = _mm_add_epi32(A, A);
    int[4] correct = [ -14, -2, 0, 18 ];
    assert(R.array == correct);
}

/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(long2)a + cast(long2)b);
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 R = cast(long2) _mm_add_epi64(A, A);
    long[2] correct = [ -2, 0 ];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(byte16)a + cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 R = cast(byte16) _mm_add_epi8(A, A);
    byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(R.array == correct);
}

/// Add the lower double-precision (64-bit) floating-point element
/// in `a` and `b`, store the result in the lower element of dst,
/// and copy the upper element from `a` to the upper element of dst.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128d) __simd(XMM.ADDSD, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}

/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    // PERF DMD
    pragma(inline, true);
    return a + b;
}
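
// Additional example (not from the original test suite): `_mm_add_si64` does a full
// 64-bit addition, so a carry propagates out of the low 32 bits.
unittest
{
    __m64 A = _mm_setr_pi32(-1, 0); // low 64 bits = 0x0000_0000_FFFF_FFFF
    __m64 B = _mm_setr_pi32( 1, 0);
    int2 R = cast(int2) _mm_add_si64(A, B);
    int[2] correct = [0, 1];        // 0x0000_0001_0000_0000
    assert(R.array == correct);
}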

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDSW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        return cast(__m128i) inteli_llvm_adds!short8(cast(short8)a, cast(short8)b);
    }
    else
    {
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_setr_epi16( 7, 6, 5, -32768, 3, 3, 32767, 0),
                                             _mm_setr_epi16( 7, 6, 5, -30000, 3, 1, 1, -10));
    static immutable short[8] correctResult = [14, 12, 10, -32768, 6, 4, 32767, -10];
    assert(res.array == correctResult);
}

/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDSB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        return cast(__m128i) inteli_llvm_adds!byte16(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11,  10, 9, 8, 7, 6, 5,   -4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, -128, 10, 12, 14,
                                               16, 18, 127, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}

/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDUSB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddusb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        return cast(__m128i) inteli_llvm_addus!byte16(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        ubyte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16)
        _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                      _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14,
                                               0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // Note: DMD generates a reversed paddusw vs LDC and GDC, but that doesn't change the result anyway
        return cast(__m128i) __simd(XMM.PADDUSW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddusw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        return cast(__m128i) inteli_llvm_addus!short8(cast(short8)a, cast(short8)b);
    }
    else
    {
        ushort[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return cast(__m128d)( cast(long2)a & cast(long2)b );
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m128d A = _mm_set_pd(a, b);
    __m128d B = _mm_set_pd(b, a);
    long2 R = cast(long2)( _mm_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return a & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_and_si128(A, B);
    int[4] correct = [6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of packed double-precision (64-bit)
/// floating-point elements in `a` and then AND with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128d) __simd(XMM.ANDNPD, a, b);
    }
    else
    {
        return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
    }
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
    long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
    __m128d A = _mm_setr_pd(a, b);
    __m128d B = _mm_setr_pd(b, a);
    long2 R = cast(long2)( _mm_andnot_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct2);
}

/// Compute the bitwise NOT of 128 bits (representing integer data)
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PANDN, a, b);
    }
    else
    {
        return (~a) & b;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(7, -2, 9, 54654);
    __m128i B = _mm_setr_epi32(14, 78, 111, -256);
    __m128i R = _mm_andnot_si128(A, B);
    int[4] correct = [8, 0, 102, -54784];
    assert(R.array == correct);
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PAVGW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48);
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PAVGB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
    }
    else version(LDC)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48);
}

/// Shift `a` left by `bytes` bytes while shifting in zeros.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Cast vector of type `__m128d` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Cast vector of type `__m128d` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128i` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}
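
// Additional example (not from the original test suite): the _mm_cast* functions above
// only reinterpret bits, so a round-trip through `__m128d` preserves the integer lanes.
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_castpd_si128(_mm_castsi128_pd(A));
    assert(B.array == A.array);
}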

/// Cast vector of type `__m128i` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}

/// Invalidate and flush the cache line that contains `p`
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else
    {
        // Do nothing. Skipping the cache-line invalidation
        // does not affect correctness.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0];
    short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4 A = [-3, -2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, -1];
    int4 R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}
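
// Additional example (not from the original test suite): the low lane becomes an
// all-ones mask on equality, and the high lane is copied from `a` unchanged.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpeq_sd(A, B);
    assert(R.array[0] == -1);                 // 1.0 == 1.0
    assert(R.array[1] == 0x4000000000000000); // bit pattern of 2.0, taken from `a`
}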

/// Compare packed 16-bit integer elements in `a` and `b` for greater-than-or-equal.
/// #BONUS
__m128i _mm_cmpge_epi16 (__m128i a, __m128i b) pure @safe
{
    version (LDC)
    {
        // LDC ARM64: generates cmge since -O1
        return cast(__m128i) greaterOrEqualMask!short8(cast(short8)a, cast(short8)b);
    }
    else
    {
        return _mm_xor_si128(_mm_cmpeq_epi16(a, b), _mm_cmpgt_epi16(a, b));
    }
}
unittest
{
    short8 A = [-3, -2, -32768, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 32767, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, -1, -1, -1, -1];
    short8 R = cast(short8)(_mm_cmpge_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1];
    short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b));
    }
}
unittest
{
    int4 A = [-3, 2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, 0];
    int4 R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    // Workaround for a GCC bug here.
    // The GCC builtin is buggy and generates a weird (and wrong) sequence
    // with __builtin_ia32_pcmpgtb128.
    // GCC's emmintrin.h uses comparison operators we don't have instead.
    // PERF: this is a quite severe GDC performance problem.
    // Could be worked around with inline assembly, or another algorithm I guess.

    /*
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else
    */
    {
        return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct = [0, 0,-1, 0, -1, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    __m128i D = _mm_cmpeq_epi8(A, B);
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}

/// Compare packed 16-bit integer elements in `a` and `b` for less-than-or-equal.
/// #BONUS
__m128i _mm_cmple_epi16 (__m128i a, __m128i b) pure @safe
{
    version (LDC)
    {
        // LDC ARM64: generates cmge since -O1
        return cast(__m128i) greaterOrEqualMask!short8(cast(short8)b, cast(short8)a);
    }
    else
    {
        return _mm_xor_si128(_mm_cmpeq_epi16(b, a), _mm_cmpgt_epi16(b, a));
    }
}
unittest
{
    short8 A = [-3, -2, -32768, 1, 0, 1, 2, 3];
    short8 B = [ 4, 3, 32767, 0, 0, -1, -2, -3];
    short[8] E = [-1, -1, -1, 0, -1, 0, 0, 0];
    short8 R = cast(short8)(_mm_cmple_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed 32-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}
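
// Additional example (not from the original test suite) for _mm_cmplt_epi32.
unittest
{
    int4 A = [-3, 2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [-1, 0, -1, 0];
    int4 R = cast(int4)(_mm_cmplt_epi32(A, B));
    assert(R.array == E);
}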

/// Compare packed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal, store the result in
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}
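
// Additional example (not from the original test suite) for _mm_cmpngt_pd above:
// each lane is an all-ones mask where `a` is not greater than `b`.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 4.0);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    long2 R = cast(long2) _mm_cmpngt_pd(A, B);
    long[2] correct = [-1, 0];
    assert(R.array == correct);
}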

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN, store the result in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN, store the result in the lower
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}
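
// Additional example (not from the original test suite): the low lane becomes an
// all-ones mask when either operand is NaN, and the high lane is copied from `a`.
unittest
{
    __m128d A = _mm_setr_pd(double.nan, 42.0);
    __m128d B = _mm_setr_pd(3.0, 2.0);
    long2 R = cast(long2) _mm_cmpunord_sd(A, B);
    assert(R.array[0] == -1);                 // NaN vs 3.0 is unordered
    assert(R.array[1] == 0x4045000000000000); // bit pattern of 42.0, taken from `a`
}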

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    // Note: For some of the _mm_comixx_sx intrinsics, the NaN semantics of the intrinsic are not the same as those
    // of the comisd instruction: the intrinsic returns false in the unordered case instead.
    //
    // Actually C++ compilers disagree over the meaning of that instruction.
    // GCC will manage NaNs like the comisd instruction (return true if unordered),
    // but ICC, clang and MSVC will deal with NaN like the Intel Intrinsics Guide says.
    // We choose to follow the majority. It seems GCC is buggy with NaNs.
    return a.array[0] == b.array[0];
}
unittest
{
    assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than-or-equal, and return the boolean
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] >= b.array[0];
}
unittest
{
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] > b.array[0];
}
unittest
{
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] <= b.array[0];
}
unittest
{
    assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] < b.array[0];
}
unittest
{
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] != b.array[0];
}
unittest
{
    assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128)__simd(XMM.CVTDQ2PS, cast(void16) a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else version(LDC)
    {
        // See #86 for why we had to resort to LLVM IR.
        // Plain code below was leading to catastrophic behaviour.
        // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x float>
            ret <4 x float> %r`;
        return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
    }
    else
    {
        __m128 res; // PERF =void;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    // PERF ARM32
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        long2 i;
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
            case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
            case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
        }
        int4 zero = 0;
        return cast(__m128i) shufflevectorLDC!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
    }
    else
    {
        // PERF ARM32
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    {
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_cvtps2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    // GDC bug #98607
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide an optimization barrier for the rounding mode.
    // Worked around with different literals. This bug will likely only manifest in unittest.
    // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtsd_si64 (__m128d a) @trusted
{
    version (LDC)
    {
        version (X86_64)
        {
            return __builtin_ia32_cvtsd2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
    else
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///

/// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit)
/// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b);
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a.ptr[0] = b.array[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}

/// Get the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la.array[0];
}
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;

/// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 r = [0, 0, 0, 0];
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}

/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}

deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///

/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit)
/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper
/// element of result.
double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
/// Put zeroes in the upper elements of result.
__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r; // PERF =void;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: generates fcvtze since LDC 1.8 -O2
    __m128i r; // PERF = void;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit it's a long sequence that resorts to the FPU instead
    return cast(long)a.array[0];
}

deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///

/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a / b;
}

/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`, store the
/// result in the lower element, and copy the upper element from `a`.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_divsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        a.ptr[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
    assert(_mm_extract_epi16(A, 5 + 8) == 5);
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

/// Perform a serializing operation on all load-from-memory instructions that were issued prior
/// to this instruction. Guarantees that every load instruction that precedes, in program order,
/// is globally visible before any load instruction which follows the fence in program order.
void _mm_lfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_lfence();
    }
    else static if (LDC_with_ARM64)
    {
        __builtin_arm_dmb(9);  // dmb ishld
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
    else version(LDC)
    {
        // When the architecture is unknown, generate a full memory barrier,
        // as the semantics of lfence do not really match those of atomics.
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_lfence();
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    pragma(inline, true);
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}
unittest
{
    align(16) double[2] S = [-5.0, 7.0];
    __m128d R = _mm_load_pd(S.ptr);
    assert(R.array == S);
}

/// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double m = *mem_addr;
    __m128d r; // PERF =void;
    r.ptr[0] = m;
    r.ptr[1] = m;
    return r;
}
unittest
{
    double what = 4;
    __m128d R = _mm_load_pd1(&what);
    double[2] correct = [4.0, 4];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper
/// element. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Load 128-bits of integer data from memory into dst.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @safe
{
    pragma(inline, true);
    return *mem_addr;
}
unittest
{
    align(16) int[4] correct = [-1, 2, 3, 4];
    int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
    assert(A.array == correct);
}

alias _mm_load1_pd = _mm_load_pd1; ///

/// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the
/// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    pragma(inline, true);
    a.ptr[1] = *mem_addr;
    return a;
}
unittest
{
    double A = 7.0;
    __m128d B = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadh_pd(B, &A);
    double[2] correct = [ 4.0, 7.0 ];
    assert(R.array == correct);
}

/// Load 64-bit integer from memory into the first element of result. Zero out the other.
/// Note: strange signature since the memory doesn't have to be aligned, and should point to addressable 64-bit, not 128-bit.
/// You may use `_mm_loadu_si64` instead.
1926 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted 1927 { 1928 pragma(inline, true); 1929 static if (DMD_with_DSIMD) 1930 { 1931 return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr); 1932 } 1933 else 1934 { 1935 auto pLong = cast(const(long)*)mem_addr; 1936 long2 r = [0, 0]; 1937 r.ptr[0] = *pLong; 1938 return cast(__m128i)(r); 1939 } 1940 } 1941 unittest 1942 { 1943 long A = 0x7878787870707070; 1944 long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A); 1945 long[2] correct = [0x7878787870707070, 0]; 1946 assert(R.array == correct); 1947 } 1948 1949 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 1950 /// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary. 1951 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted 1952 { 1953 a.ptr[0] = *mem_addr; 1954 return a; 1955 } 1956 unittest 1957 { 1958 double A = 7.0; 1959 __m128d B = _mm_setr_pd(4.0, -5.0); 1960 __m128d R = _mm_loadl_pd(B, &A); 1961 double[2] correct = [ 7.0, -5.0 ]; 1962 assert(R.array == correct); 1963 } 1964 1965 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 1966 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1967 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted 1968 { 1969 __m128d a = *cast(__m128d*)(mem_addr); 1970 __m128d r; // PERF =void; 1971 r.ptr[0] = a.array[1]; 1972 r.ptr[1] = a.array[0]; 1973 return r; 1974 } 1975 unittest 1976 { 1977 align(16) double[2] A = [56.0, -74.0]; 1978 __m128d R = _mm_loadr_pd(A.ptr); 1979 double[2] correct = [-74.0, 56.0]; 1980 assert(R.array == correct); 1981 } 1982 1983 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 1984 /// `mem_addr` does not need to be aligned on any particular boundary. 1985 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted 1986 { 1987 pragma(inline, true); 1988 static if (GDC_with_SSE2) 1989 { 1990 return __builtin_ia32_loadupd(mem_addr); 1991 } 1992 else version(LDC) 1993 { 1994 return loadUnaligned!(double2)(mem_addr); 1995 } 1996 else version(DigitalMars) 1997 { 1998 // Apparently inside __simd you can use aligned dereferences without fear. 1999 // That was issue 23048 on dlang's Bugzilla. 2000 static if (DMD_with_DSIMD) 2001 { 2002 return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr); 2003 } 2004 else static if (SSESizedVectorsAreEmulated) 2005 { 2006 // Since this vector is emulated, it doesn't have alignement constraints 2007 // and as such we can just cast it. 2008 return *cast(__m128d*)(mem_addr); 2009 } 2010 else 2011 { 2012 __m128d result; 2013 result.ptr[0] = mem_addr[0]; 2014 result.ptr[1] = mem_addr[1]; 2015 return result; 2016 } 2017 } 2018 else 2019 { 2020 __m128d result; 2021 result.ptr[0] = mem_addr[0]; 2022 result.ptr[1] = mem_addr[1]; 2023 return result; 2024 } 2025 } 2026 unittest 2027 { 2028 double[2] A = [56.0, -75.0]; 2029 __m128d R = _mm_loadu_pd(A.ptr); 2030 double[2] correct = [56.0, -75.0]; 2031 assert(R.array == correct); 2032 } 2033 2034 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary. 
2035 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted 2036 { 2037 // PERF DMD 2038 pragma(inline, true); 2039 static if (GDC_with_SSE2) 2040 { 2041 return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr); 2042 } 2043 else version(LDC) 2044 { 2045 return loadUnaligned!(__m128i)(cast(int*)mem_addr); 2046 } 2047 else 2048 { 2049 const(int)* p = cast(const(int)*)mem_addr; 2050 __m128i r = void; 2051 r.ptr[0] = p[0]; 2052 r.ptr[1] = p[1]; 2053 r.ptr[2] = p[2]; 2054 r.ptr[3] = p[3]; 2055 return r; 2056 } 2057 } 2058 unittest 2059 { 2060 align(16) int[4] correct = [-1, 2, -3, 4]; 2061 int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr); 2062 assert(A.array == correct); 2063 } 2064 2065 /// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise. 2066 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted // TODO: should be @system actually 2067 { 2068 static if (DMD_with_DSIMD) 2069 { 2070 int r = *cast(short*)(mem_addr); 2071 return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r); 2072 } 2073 else version(DigitalMars) 2074 { 2075 // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672 2076 // DMD cannot handle the below code... 2077 align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0]; 2078 r[0] = *cast(short*)(mem_addr); 2079 return *cast(int4*)(r.ptr); 2080 } 2081 else 2082 { 2083 short r = *cast(short*)(mem_addr); 2084 short8 result = [0, 0, 0, 0, 0, 0, 0, 0]; 2085 result.ptr[0] = r; 2086 return cast(__m128i)result; 2087 } 2088 } 2089 unittest 2090 { 2091 short r = 13; 2092 short8 A = cast(short8) _mm_loadu_si16(&r); 2093 short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0]; 2094 assert(A.array == correct); 2095 } 2096 2097 /// Load unaligned 32-bit integer from memory into the first element of result. 2098 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted // TODO: should be @system actually 2099 { 2100 pragma(inline, true); 2101 int r = *cast(int*)(mem_addr); 2102 int4 result = [0, 0, 0, 0]; 2103 result.ptr[0] = r; 2104 return result; 2105 } 2106 unittest 2107 { 2108 int r = 42; 2109 __m128i A = _mm_loadu_si32(&r); 2110 int[4] correct = [42, 0, 0, 0]; 2111 assert(A.array == correct); 2112 } 2113 2114 /// Load unaligned 64-bit integer from memory into the first element of result. 2115 /// Upper 64-bit is zeroed. 2116 __m128i _mm_loadu_si64 (const(void)* mem_addr) pure @system 2117 { 2118 pragma(inline, true); 2119 static if (DMD_with_DSIMD) 2120 { 2121 return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr); 2122 } 2123 else 2124 { 2125 auto pLong = cast(const(long)*)mem_addr; 2126 long2 r = [0, 0]; 2127 r.ptr[0] = *pLong; 2128 return cast(__m128i)r; 2129 } 2130 } 2131 unittest 2132 { 2133 long r = 446446446446; 2134 long2 A = cast(long2) _mm_loadu_si64(&r); 2135 long[2] correct = [446446446446, 0]; 2136 assert(A.array == correct); 2137 } 2138 2139 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 2140 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 2141 /// and pack the results in destination. 
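/// Illustrative example (values chosen arbitrarily): if lanes 0 and 1 of both inputs hold 1 and 2,
/// output lane 0 holds `1*1 + 2*2 == 5`.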
2142 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted 2143 { 2144 static if (GDC_with_SSE2) 2145 { 2146 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2147 } 2148 else static if (LDC_with_SSE2) 2149 { 2150 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2151 } 2152 else static if (LDC_with_ARM64) 2153 { 2154 int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b)); 2155 int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b)); 2156 int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); 2157 int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); 2158 return vcombine_s32(rl, rh); 2159 } 2160 else 2161 { 2162 short8 sa = cast(short8)a; 2163 short8 sb = cast(short8)b; 2164 int4 r; 2165 foreach(i; 0..4) 2166 { 2167 r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1]; 2168 } 2169 return r; 2170 } 2171 } 2172 unittest 2173 { 2174 short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2175 short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2176 int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B); 2177 int[4] correct = [1, 13, -2147483648, 2*32767*32767]; 2178 assert(R.array == correct); 2179 } 2180 2181 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 2182 /// (elements are not stored when the highest bit is not set in the corresponding element) 2183 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 2184 /// boundary. 2185 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted 2186 { 2187 static if (GDC_with_SSE2) 2188 { 2189 return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr); 2190 } 2191 else static if (LDC_with_SSE2) 2192 { 2193 return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr); 2194 } 2195 else static if (LDC_with_ARM64) 2196 { 2197 // PERF: catastrophic on ARM32 2198 byte16 bmask = cast(byte16)mask; 2199 byte16 shift = 7; 2200 bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask 2201 mask = cast(__m128i) bmask; 2202 __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr); 2203 dest = (a & mask) | (dest & ~mask); 2204 storeUnaligned!__m128i(dest, cast(int*)mem_addr); 2205 } 2206 else 2207 { 2208 byte16 b = cast(byte16)a; 2209 byte16 m = cast(byte16)mask; 2210 byte* dest = cast(byte*)(mem_addr); 2211 foreach(j; 0..16) 2212 { 2213 if (m.array[j] & 128) 2214 { 2215 dest[j] = b.array[j]; 2216 } 2217 } 2218 } 2219 } 2220 unittest 2221 { 2222 ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]; 2223 __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0); 2224 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15); 2225 _mm_maskmoveu_si128(A, mask, dest.ptr); 2226 ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42]; 2227 assert(dest == correct); 2228 } 2229 2230 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values. 
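/// Illustrative example: the comparison is signed, so `_mm_max_epi16(_mm_set1_epi16(-1), _mm_set1_epi16(5))`
/// yields 5 in every lane; -1 is not treated as 0xFFFF.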
2231 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe 2232 { 2233 static if (GDC_with_SSE2) 2234 { 2235 return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b); 2236 } 2237 else version(LDC) 2238 { 2239 // x86: pmaxsw since LDC 1.0 -O1 2240 // ARM: smax.8h since LDC 1.5 -01 2241 short8 sa = cast(short8)a; 2242 short8 sb = cast(short8)b; 2243 short8 greater = greaterMask!short8(sa, sb); 2244 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2245 } 2246 else 2247 { 2248 __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else 2249 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2250 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2251 return _mm_xor_si128(b, mask); 2252 } 2253 } 2254 unittest 2255 { 2256 short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57), 2257 _mm_setr_epi16(-4,-8, 9, 7, 0,-32768, 0, 0)); 2258 short[8] correct = [32767, 1, 9, 7, 9, 7, 0, 0]; 2259 assert(R.array == correct); 2260 } 2261 2262 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values. 2263 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe 2264 { 2265 version(LDC) 2266 { 2267 // x86: pmaxub since LDC 1.0.0 -O1 2268 // ARM64: umax.16b since LDC 1.5.0 -O1 2269 // PERF: catastrophic on ARM32 2270 ubyte16 sa = cast(ubyte16)a; 2271 ubyte16 sb = cast(ubyte16)b; 2272 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2273 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2274 } 2275 else 2276 { 2277 __m128i value128 = _mm_set1_epi8(-128); 2278 __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2279 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2280 __m128i mask = _mm_and_si128(aTob, higher); 2281 return _mm_xor_si128(b, mask); 2282 } 2283 } 2284 unittest 2285 { 2286 byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2287 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2288 byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 2289 assert(R.array == correct); 2290 } 2291 2292 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return 2293 /// packed maximum values. 2294 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted 2295 { 2296 static if (GDC_with_SSE2) 2297 { 2298 return __builtin_ia32_maxpd(a, b); 2299 } 2300 else 2301 { 2302 // x86: Generates maxpd starting with LDC 1.9 -O2 2303 a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; 2304 a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1]; 2305 return a; 2306 } 2307 } 2308 unittest 2309 { 2310 __m128d A = _mm_setr_pd(4.0, 1.0); 2311 __m128d B = _mm_setr_pd(1.0, 8.0); 2312 __m128d M = _mm_max_pd(A, B); 2313 assert(M.array[0] == 4.0); 2314 assert(M.array[1] == 8.0); 2315 } 2316 2317 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 2318 /// lower element of result, and copy the upper element from `a` to the upper element of result. 2319 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted 2320 { 2321 static if (GDC_with_SSE2) 2322 { 2323 return __builtin_ia32_maxsd(a, b); 2324 } 2325 else 2326 { 2327 __m128d r = a; 2328 // Generates maxsd starting with LDC 1.3 2329 r.ptr[0] = (a.array[0] > b.array[0]) ? 
a.array[0] : b.array[0]; 2330 return r; 2331 } 2332 } 2333 unittest 2334 { 2335 __m128d A = _mm_setr_pd(1.0, 1.0); 2336 __m128d B = _mm_setr_pd(4.0, 2.0); 2337 __m128d M = _mm_max_sd(A, B); 2338 assert(M.array[0] == 4.0); 2339 assert(M.array[1] == 1.0); 2340 } 2341 2342 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 2343 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 2344 /// is globally visible before any memory instruction which follows the fence in program order. 2345 void _mm_mfence() @trusted // not pure! 2346 { 2347 version(GNU) 2348 { 2349 static if (GDC_with_SSE2) 2350 { 2351 __builtin_ia32_mfence(); 2352 } 2353 else version(X86) 2354 { 2355 asm pure nothrow @nogc @trusted 2356 { 2357 "mfence;\n" : : : ; 2358 } 2359 } 2360 else 2361 static assert(false); 2362 } 2363 else static if (LDC_with_SSE2) 2364 { 2365 __builtin_ia32_mfence(); 2366 } 2367 else static if (DMD_with_asm) 2368 { 2369 asm nothrow @nogc pure @safe 2370 { 2371 mfence; 2372 } 2373 } 2374 else version(LDC) 2375 { 2376 // Note: will generate the DMB ish instruction on ARM 2377 llvm_memory_fence(); 2378 } 2379 else 2380 static assert(false); 2381 } 2382 unittest 2383 { 2384 _mm_mfence(); 2385 } 2386 2387 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values. 2388 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe 2389 { 2390 static if (GDC_with_SSE2) 2391 { 2392 return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b); 2393 } 2394 else version(LDC) 2395 { 2396 // x86: pminsw since LDC 1.0 -O1 2397 // ARM64: smin.8h since LDC 1.5 -01 2398 short8 sa = cast(short8)a; 2399 short8 sb = cast(short8)b; 2400 short8 greater = greaterMask!short8(sa, sb); 2401 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2402 } 2403 else 2404 { 2405 __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else 2406 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2407 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2408 return _mm_xor_si128(b, mask); 2409 } 2410 } 2411 unittest 2412 { 2413 short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-32768), 2414 _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0)); 2415 short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -32768]; 2416 assert(R.array == correct); 2417 } 2418 2419 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values. 
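/// Illustrative example: the comparison is unsigned, so a byte of -1 compares as 255 and
/// `_mm_min_epu8(_mm_set1_epi8(-1), _mm_set1_epi8(1))` yields 1 in every lane.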
2420 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe 2421 { 2422 version(LDC) 2423 { 2424 // x86: pminub since LDC 1.0.0 -O1 2425 // ARM: umin.16b since LDC 1.5.0 -O1 2426 // PERF: catastrophic on ARM32 2427 ubyte16 sa = cast(ubyte16)a; 2428 ubyte16 sb = cast(ubyte16)b; 2429 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2430 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2431 } 2432 else 2433 { 2434 __m128i value128 = _mm_set1_epi8(-128); 2435 __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2436 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2437 __m128i mask = _mm_and_si128(aTob, lower); 2438 return _mm_xor_si128(b, mask); 2439 } 2440 } 2441 unittest 2442 { 2443 byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2444 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2445 byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; 2446 assert(R.array == correct); 2447 } 2448 2449 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values. 2450 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted 2451 { 2452 static if (GDC_with_SSE2) 2453 { 2454 return __builtin_ia32_minpd(a, b); 2455 } 2456 else 2457 { 2458 // Generates minpd starting with LDC 1.9 2459 a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2460 a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1]; 2461 return a; 2462 } 2463 } 2464 unittest 2465 { 2466 __m128d A = _mm_setr_pd(1.0, 2.0); 2467 __m128d B = _mm_setr_pd(4.0, 1.0); 2468 __m128d M = _mm_min_pd(A, B); 2469 assert(M.array[0] == 1.0); 2470 assert(M.array[1] == 1.0); 2471 } 2472 2473 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 2474 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 2475 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe 2476 { 2477 static if (GDC_with_SSE2) 2478 { 2479 return __builtin_ia32_minsd(a, b); 2480 } 2481 else 2482 { 2483 // Generates minsd starting with LDC 1.3 2484 __m128d r = a; 2485 r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2486 return r; 2487 } 2488 } 2489 unittest 2490 { 2491 __m128d A = _mm_setr_pd(1.0, 3.0); 2492 __m128d B = _mm_setr_pd(4.0, 2.0); 2493 __m128d M = _mm_min_sd(A, B); 2494 assert(M.array[0] == 1.0); 2495 assert(M.array[1] == 3.0); 2496 } 2497 2498 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element. 2499 __m128i _mm_move_epi64 (__m128i a) pure @trusted 2500 { 2501 static if (GDC_with_SSE2) 2502 { 2503 // slightly better with GDC -O0 2504 return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 2505 } 2506 else 2507 { 2508 long2 result = [ 0, 0 ]; 2509 long2 la = cast(long2) a; 2510 result.ptr[0] = la.array[0]; 2511 return cast(__m128i)(result); 2512 } 2513 } 2514 unittest 2515 { 2516 long2 A = [13, 47]; 2517 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); 2518 long[2] correct = [13, 0]; 2519 assert(B.array == correct); 2520 } 2521 2522 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 2523 /// the upper element from `a` to the upper element of dst. 
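/// Illustrative example (same values as the unittest below): with `a = [13.0, 47.0]` and `b = [34.0, 58.0]`,
/// the result is `[34.0, 47.0]`.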
2524 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
2525 {
2526 static if (GDC_with_SSE2)
2527 {
2528 return __builtin_ia32_movsd(a, b);
2529 }
2530 else
2531 {
2532 b.ptr[1] = a.array[1];
2533 return b;
2534 }
2535 }
2536 unittest
2537 {
2538 double2 A = [13.0, 47.0];
2539 double2 B = [34.0, 58.0];
2540 double2 C = _mm_move_sd(A, B);
2541 double[2] correct = [34.0, 47.0];
2542 assert(C.array == correct);
2543 }
2544
2545 /// Create mask from the most significant bit of each 8-bit element in `a`.
2546 int _mm_movemask_epi8 (__m128i a) pure @trusted
2547 {
2548 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2549 static if (GDC_with_SSE2)
2550 {
2551 return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
2552 }
2553 else static if (LDC_with_SSE2)
2554 {
2555 return __builtin_ia32_pmovmskb128(cast(byte16)a);
2556 }
2557 else static if (LDC_with_ARM64)
2558 {
2559 // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
2560 // The other two solutions lead to unfound intrinsics in LLVM and that took a long time.
2561 // So there might be something a bit faster, but this one is reasonable and branchless.
2562 byte8 mask_shift;
2563 mask_shift.ptr[0] = 7;
2564 mask_shift.ptr[1] = 6;
2565 mask_shift.ptr[2] = 5;
2566 mask_shift.ptr[3] = 4;
2567 mask_shift.ptr[4] = 3;
2568 mask_shift.ptr[5] = 2;
2569 mask_shift.ptr[6] = 1;
2570 mask_shift.ptr[7] = 0;
2571 byte8 mask_and = byte8(-128);
2572 byte8 lo = vget_low_u8(cast(byte16)a);
2573 byte8 hi = vget_high_u8(cast(byte16)a);
2574 lo = vand_u8(lo, mask_and);
2575 lo = vshr_u8(lo, mask_shift);
2576 hi = vand_u8(hi, mask_and);
2577 hi = vshr_u8(hi, mask_shift);
2578 lo = vpadd_u8(lo,lo);
2579 lo = vpadd_u8(lo,lo);
2580 lo = vpadd_u8(lo,lo);
2581 hi = vpadd_u8(hi,hi);
2582 hi = vpadd_u8(hi,hi);
2583 hi = vpadd_u8(hi,hi);
2584 return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
2585 }
2586 else
2587 {
2588 byte16 ai = cast(byte16)a;
2589 int r = 0;
2590 foreach(bit; 0..16)
2591 {
2592 if (ai.array[bit] < 0) r += (1 << bit);
2593 }
2594 return r;
2595 }
2596 }
2597 unittest
2598 {
2599 assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
2600 }
2601
2602 /// Create mask from the most significant bit of each 16-bit element in `a`. #BONUS
2603 int _mm_movemask_epi16 (__m128i a) pure @trusted
2604 {
2605 return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128()));
2606 }
2607 unittest
2608 {
2609 assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8)));
2610 }
2611
2612 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit)
2613 /// floating-point element in `v`.
2614 int _mm_movemask_pd(__m128d v) pure @safe
2615 {
2616 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
2617 static if (GDC_or_LDC_with_SSE2)
2618 {
2619 return __builtin_ia32_movmskpd(v);
2620 }
2621 else
2622 {
2623 long2 lv = cast(long2)v;
2624 int r = 0;
2625 if (lv.array[0] < 0) r += 1;
2626 if (lv.array[1] < 0) r += 2;
2627 return r;
2628 }
2629 }
2630 unittest
2631 {
2632 __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
2633 assert(_mm_movemask_pd(A) == 2);
2634 }
2635
2636 /// Copy the lower 64-bit integer in `v`.
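/// Illustrative example (mirrors the unittest below): `_mm_movepi64_pi64(_mm_set_epi64x(-1, -2))`
/// returns an `__m64` holding -2; the upper 64-bit lane is simply dropped.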
2637 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe 2638 { 2639 long2 lv = cast(long2)v; 2640 return long1(lv.array[0]); 2641 } 2642 unittest 2643 { 2644 __m128i A = _mm_set_epi64x(-1, -2); 2645 __m64 R = _mm_movepi64_pi64(A); 2646 assert(R.array[0] == -2); 2647 } 2648 2649 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element. 2650 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted 2651 { 2652 long2 r; 2653 r.ptr[0] = a.array[0]; 2654 r.ptr[1] = 0; 2655 return cast(__m128i)r; 2656 } 2657 2658 /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, 2659 /// and store the unsigned 64-bit results. 2660 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted 2661 { 2662 // PERF DMD D_SIMD 2663 static if (GDC_with_SSE2) 2664 { 2665 return cast(__m128i) __builtin_ia32_pmuludq128 (a, b); 2666 } 2667 else 2668 { 2669 version(LDC) 2670 { 2671 static if (__VERSION__ >= 2088) 2672 { 2673 // Need LLVM9 for proper optimization 2674 long2 la, lb; 2675 la.ptr[0] = cast(uint)a.array[0]; 2676 la.ptr[1] = cast(uint)a.array[2]; 2677 lb.ptr[0] = cast(uint)b.array[0]; 2678 lb.ptr[1] = cast(uint)b.array[2]; 2679 } 2680 else 2681 { 2682 __m128i zero; 2683 zero = 0; 2684 long2 la = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(a, zero); 2685 long2 lb = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(b, zero); 2686 } 2687 } 2688 else 2689 { 2690 long2 la, lb; 2691 la.ptr[0] = cast(uint)a.array[0]; 2692 la.ptr[1] = cast(uint)a.array[2]; 2693 lb.ptr[0] = cast(uint)b.array[0]; 2694 lb.ptr[1] = cast(uint)b.array[2]; 2695 } 2696 2697 version(DigitalMars) 2698 { 2699 // DMD has no long2 mul 2700 la.ptr[0] *= lb.array[0]; 2701 la.ptr[1] *= lb.array[1]; 2702 return cast(__m128i)(la); 2703 } 2704 else 2705 { 2706 static if (__VERSION__ >= 2076) 2707 { 2708 return cast(__m128i)(la * lb); 2709 } 2710 else 2711 { 2712 // long2 mul not supported before LDC 1.5 2713 la.ptr[0] *= lb.array[0]; 2714 la.ptr[1] *= lb.array[1]; 2715 return cast(__m128i)(la); 2716 } 2717 } 2718 } 2719 } 2720 unittest 2721 { 2722 __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff); 2723 __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff); 2724 __m128i C = _mm_mul_epu32(A, B); 2725 long2 LC = cast(long2)C; 2726 assert(LC.array[0] == 18446744065119617025uL); 2727 assert(LC.array[1] == 12723420444339690338uL); 2728 } 2729 2730 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 2731 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe 2732 { 2733 pragma(inline, true); 2734 return a * b; 2735 } 2736 unittest 2737 { 2738 __m128d a = [-2.0, 1.5]; 2739 a = _mm_mul_pd(a, a); 2740 assert(a.array == [4.0, 2.25]); 2741 } 2742 2743 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 2744 /// element of result, and copy the upper element from `a` to the upper element of result. 
2745 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted 2746 { 2747 version(DigitalMars) 2748 { 2749 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 2750 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 2751 asm pure nothrow @nogc @trusted { nop;} 2752 a.array[0] = a.array[0] * b.array[0]; 2753 return a; 2754 } 2755 else static if (GDC_with_SSE2) 2756 { 2757 return __builtin_ia32_mulsd(a, b); 2758 } 2759 else 2760 { 2761 a.ptr[0] *= b.array[0]; 2762 return a; 2763 } 2764 } 2765 unittest 2766 { 2767 __m128d a = [-2.0, 1.5]; 2768 a = _mm_mul_sd(a, a); 2769 assert(a.array == [4.0, 1.5]); 2770 } 2771 2772 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 2773 /// and get an unsigned 64-bit result. 2774 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe 2775 { 2776 return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b))); 2777 } 2778 unittest 2779 { 2780 __m64 A = _mm_set_pi32(42, 0xDEADBEEF); 2781 __m64 B = _mm_set_pi32(42, 0xCAFEBABE); 2782 __m64 C = _mm_mul_su32(A, B); 2783 assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL); 2784 } 2785 2786 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2787 /// high 16 bits of the intermediate integers. 2788 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted 2789 { 2790 static if (GDC_with_SSE2) 2791 { 2792 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2793 } 2794 else static if (LDC_with_SSE2) 2795 { 2796 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2797 } 2798 else 2799 { 2800 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h 2801 // PERF: it seems the simde solution has one less instruction in ARM64. 2802 // PERF: Catastrophic in ARM32. 2803 short8 sa = cast(short8)a; 2804 short8 sb = cast(short8)b; 2805 short8 r = void; 2806 r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16; 2807 r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16; 2808 r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16; 2809 r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16; 2810 r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16; 2811 r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16; 2812 r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16; 2813 r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16; 2814 return cast(__m128i)r; 2815 } 2816 } 2817 unittest 2818 { 2819 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2820 __m128i B = _mm_set1_epi16(16384); 2821 short8 R = cast(short8)_mm_mulhi_epi16(A, B); 2822 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; 2823 assert(R.array == correct); 2824 } 2825 2826 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2827 /// high 16 bits of the intermediate integers. 2828 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted 2829 { 2830 static if (GDC_with_SSE2) 2831 { 2832 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2833 } 2834 else static if (LDC_with_SSE2) 2835 { 2836 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2837 } 2838 else 2839 { 2840 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h 2841 // it seems the simde solution has one less instruction in ARM64 2842 // PERF: Catastrophic in ARM32. 
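// Scalar fallback sketch: each lane is widened to unsigned 32-bit, multiplied, and only the top
// 16 bits are kept. For instance 0xFFF0 * 0x4000 == 0x3FFC_0000, so that lane stores 0x3FFC
// (this is the second lane of the unittest further below).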
2843 short8 sa = cast(short8)a; 2844 short8 sb = cast(short8)b; 2845 short8 r = void; 2846 r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 ); 2847 r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 ); 2848 r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 ); 2849 r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 ); 2850 r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 ); 2851 r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 ); 2852 r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 ); 2853 r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 ); 2854 return cast(__m128i)r; 2855 } 2856 } 2857 unittest 2858 { 2859 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2860 __m128i B = _mm_set1_epi16(16384); 2861 short8 R = cast(short8)_mm_mulhi_epu16(A, B); 2862 short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; 2863 assert(R.array == correct); 2864 } 2865 2866 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 2867 /// bits of the intermediate integers. 2868 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe 2869 { 2870 return cast(__m128i)(cast(short8)a * cast(short8)b); 2871 } 2872 unittest 2873 { 2874 __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7); 2875 __m128i B = _mm_set1_epi16(16384); 2876 short8 R = cast(short8)_mm_mullo_epi16(A, B); 2877 short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384]; 2878 assert(R.array == correct); 2879 } 2880 2881 /// Compute the bitwise NOT of 128 bits in `a`. #BONUS 2882 __m128i _mm_not_si128 (__m128i a) pure @safe 2883 { 2884 return ~a; 2885 } 2886 unittest 2887 { 2888 __m128i A = _mm_set1_epi32(-748); 2889 int4 notA = cast(int4) _mm_not_si128(A); 2890 int[4] correct = [747, 747, 747, 747]; 2891 assert(notA.array == correct); 2892 } 2893 2894 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 2895 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe 2896 { 2897 pragma(inline, true); 2898 return cast(__m128d)( cast(__m128i)a | cast(__m128i)b ); 2899 } 2900 2901 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`. 2902 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe 2903 { 2904 pragma(inline, true); 2905 return a | b; 2906 } 2907 2908 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation. 
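/// Illustrative example: 100000 saturates to 32767 and -100000 to -32768; lanes of `a` land in the
/// low half of the result and lanes of `b` in the high half.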
2909 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted 2910 { 2911 static if (GDC_with_SSE2) 2912 { 2913 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2914 } 2915 else static if (LDC_with_SSE2) 2916 { 2917 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2918 } 2919 else static if (LDC_with_ARM64) 2920 { 2921 short4 ra = vqmovn_s32(cast(int4)a); 2922 short4 rb = vqmovn_s32(cast(int4)b); 2923 return cast(__m128i)vcombine_s16(ra, rb); 2924 } 2925 else 2926 { 2927 // PERF: catastrophic on ARM32 2928 short8 r; 2929 r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]); 2930 r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]); 2931 r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]); 2932 r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]); 2933 r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]); 2934 r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]); 2935 r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]); 2936 r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]); 2937 return cast(__m128i)r; 2938 } 2939 } 2940 unittest 2941 { 2942 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 2943 short8 R = cast(short8) _mm_packs_epi32(A, A); 2944 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; 2945 assert(R.array == correct); 2946 } 2947 2948 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation. 2949 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted 2950 { 2951 static if (GDC_with_SSE2) 2952 { 2953 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2954 } 2955 else static if (LDC_with_SSE2) 2956 { 2957 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2958 } 2959 else static if (LDC_with_ARM64) 2960 { 2961 // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02 2962 byte8 ra = vqmovn_s16(cast(short8)a); 2963 byte8 rb = vqmovn_s16(cast(short8)b); 2964 return cast(__m128i)vcombine_s8(ra, rb); 2965 } 2966 else 2967 { 2968 // PERF: ARM32 is missing 2969 byte16 r; 2970 short8 sa = cast(short8)a; 2971 short8 sb = cast(short8)b; 2972 foreach(i; 0..8) 2973 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]); 2974 foreach(i; 0..8) 2975 r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]); 2976 return cast(__m128i)r; 2977 } 2978 } 2979 unittest 2980 { 2981 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); 2982 byte16 R = cast(byte16) _mm_packs_epi16(A, A); 2983 byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0, 2984 127, -128, 127, 0, 127, -128, 127, 0]; 2985 assert(R.array == correct); 2986 } 2987 2988 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation. 
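/// Illustrative example: negative inputs clamp to 0 and anything above 255 clamps to 255,
/// e.g. -10 becomes 0 and 400 becomes 255 (see the unittest below).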
2989 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2990 { 2991 // PERF DMD catastrophic 2992 static if (GDC_with_SSE2) 2993 { 2994 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2995 } 2996 else static if (LDC_with_SSE2) 2997 { 2998 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2999 } 3000 else static if (LDC_with_ARM64) 3001 { 3002 // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02 3003 byte8 ra = vqmovun_s16(cast(short8)a); 3004 byte8 rb = vqmovun_s16(cast(short8)b); 3005 return cast(__m128i)vcombine_s8(ra, rb); 3006 } 3007 else 3008 { 3009 short8 sa = cast(short8)a; 3010 short8 sb = cast(short8)b; 3011 align(16) ubyte[16] result = void; 3012 for (int i = 0; i < 8; ++i) 3013 { 3014 short s = sa[i]; 3015 if (s < 0) s = 0; 3016 if (s > 255) s = 255; 3017 result[i] = cast(ubyte)s; 3018 3019 s = sb[i]; 3020 if (s < 0) s = 0; 3021 if (s > 255) s = 255; 3022 result[i+8] = cast(ubyte)s; 3023 } 3024 return *cast(__m128i*)(result.ptr); 3025 } 3026 } 3027 unittest 3028 { 3029 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 3030 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 3031 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 3032 0, 255, 0, 255, 255, 2, 1, 0]; 3033 foreach(i; 0..16) 3034 assert(AA.array[i] == cast(byte)(correctResult[i])); 3035 } 3036 3037 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 3038 /// and power consumption of spin-wait loops. 3039 void _mm_pause() @trusted 3040 { 3041 version(GNU) 3042 { 3043 static if (GDC_with_SSE2) 3044 { 3045 __builtin_ia32_pause(); 3046 } 3047 else version(X86) 3048 { 3049 asm pure nothrow @nogc @trusted 3050 { 3051 "pause;\n" : : : ; 3052 } 3053 } 3054 else 3055 static assert(false); 3056 } 3057 else static if (LDC_with_SSE2) 3058 { 3059 __builtin_ia32_pause(); 3060 } 3061 else static if (DMD_with_asm) 3062 { 3063 asm nothrow @nogc pure @safe 3064 { 3065 rep; nop; // F3 90 = pause 3066 } 3067 } 3068 else version (LDC) 3069 { 3070 // PERF: Do nothing currently , could be the "yield" intruction on ARM. 3071 } 3072 else 3073 static assert(false); 3074 } 3075 unittest 3076 { 3077 _mm_pause(); 3078 } 3079 3080 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 3081 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 3082 /// low 16 bits of 64-bit elements in result. 
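/// Illustrative example (values chosen arbitrarily): with every byte of `a` equal to 5 and every byte
/// of `b` equal to 1, each group of 8 absolute differences sums to 32, so both 64-bit lanes hold 32.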
3083 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted 3084 { 3085 static if (GDC_with_SSE2) 3086 { 3087 return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b); 3088 } 3089 else static if (LDC_with_SSE2) 3090 { 3091 return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b); 3092 } 3093 else static if (LDC_with_ARM64) 3094 { 3095 ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b)); 3096 3097 // PERF: Looks suboptimal vs addp 3098 ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]); 3099 ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]); 3100 ushort8 r = 0; 3101 r[0] = r0; 3102 r[4] = r4; 3103 return cast(__m128i) r; 3104 } 3105 else 3106 { 3107 // PERF: ARM32 is lacking 3108 byte16 ab = cast(byte16)a; 3109 byte16 bb = cast(byte16)b; 3110 ubyte[16] t; 3111 foreach(i; 0..16) 3112 { 3113 int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]); 3114 if (diff < 0) diff = -diff; 3115 t[i] = cast(ubyte)(diff); 3116 } 3117 int4 r = _mm_setzero_si128(); 3118 r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 3119 r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; 3120 return r; 3121 } 3122 } 3123 unittest 3124 { 3125 __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 3126 __m128i B = _mm_set1_epi8(1); 3127 __m128i R = _mm_sad_epu8(A, B); 3128 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 3129 0, 3130 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 3131 0]; 3132 assert(R.array == correct); 3133 } 3134 3135 /// Set packed 16-bit integers with the supplied values. 3136 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 3137 { 3138 short8 r = void; 3139 r.ptr[0] = e0; 3140 r.ptr[1] = e1; 3141 r.ptr[2] = e2; 3142 r.ptr[3] = e3; 3143 r.ptr[4] = e4; 3144 r.ptr[5] = e5; 3145 r.ptr[6] = e6; 3146 r.ptr[7] = e7; 3147 return cast(__m128i) r; 3148 } 3149 unittest 3150 { 3151 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 3152 short8 B = cast(short8) A; 3153 foreach(i; 0..8) 3154 assert(B.array[i] == i); 3155 } 3156 3157 /// Set packed 32-bit integers with the supplied values. 3158 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3159 { 3160 // PERF: does a constant inline correctly? vs int4 field assignment 3161 align(16) int[4] r = [e0, e1, e2, e3]; 3162 return *cast(int4*)&r; 3163 } 3164 unittest 3165 { 3166 __m128i A = _mm_set_epi32(3, 2, 1, 0); 3167 foreach(i; 0..4) 3168 assert(A.array[i] == i); 3169 } 3170 3171 /// Set packed 64-bit integers with the supplied values. 3172 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted 3173 { 3174 pragma(inline, true); 3175 long2 r = void; 3176 r.ptr[0] = e0.array[0]; 3177 r.ptr[1] = e1.array[0]; 3178 return cast(__m128i)(r); 3179 } 3180 unittest 3181 { 3182 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); 3183 long2 B = cast(long2) A; 3184 assert(B.array[0] == 5678); 3185 assert(B.array[1] == 1234); 3186 } 3187 3188 /// Set packed 64-bit integers with the supplied values. 
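/// Illustrative note: arguments are given high lane first, so `_mm_set_epi64x(1, 2)` stores 2 in
/// element 0 and 1 in element 1 (the unittest below shows the same ordering).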
3189 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted 3190 { 3191 pragma(inline, true); 3192 long2 r = void; 3193 r.ptr[0] = e0; 3194 r.ptr[1] = e1; 3195 return cast(__m128i)(r); 3196 } 3197 unittest 3198 { 3199 __m128i A = _mm_set_epi64x(1234, -5678); 3200 long2 B = cast(long2) A; 3201 assert(B.array[0] == -5678); 3202 assert(B.array[1] == 1234); 3203 } 3204 3205 /// Set packed 8-bit integers with the supplied values. 3206 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12, 3207 byte e11, byte e10, byte e9, byte e8, 3208 byte e7, byte e6, byte e5, byte e4, 3209 byte e3, byte e2, byte e1, byte e0) pure @trusted 3210 { 3211 align(16) byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7, 3212 e8, e9, e10, e11, e12, e13, e14, e15]; 3213 return *cast(__m128i*)(result.ptr); 3214 } 3215 unittest 3216 { 3217 byte16 R = cast(byte16) _mm_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); 3218 byte[16] correct = [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1]; 3219 assert(R.array == correct); 3220 } 3221 3222 /// Set packed double-precision (64-bit) floating-point elements with the supplied values. 3223 __m128d _mm_set_pd (double e1, double e0) pure @trusted 3224 { 3225 pragma(inline, true); 3226 double2 r = void; 3227 r.ptr[0] = e0; 3228 r.ptr[1] = e1; 3229 return r; 3230 } 3231 unittest 3232 { 3233 __m128d A = _mm_set_pd(61.0, 55.0); 3234 double[2] correct = [55.0, 61.0]; 3235 assert(A.array == correct); 3236 } 3237 3238 /// Broadcast double-precision (64-bit) floating-point value `a` to all element. 3239 __m128d _mm_set_pd1 (double a) pure @trusted 3240 { 3241 pragma(inline, true); 3242 __m128d r = void; 3243 r.ptr[0] = a; 3244 r.ptr[1] = a; 3245 return r; 3246 } 3247 unittest 3248 { 3249 __m128d A = _mm_set_pd1(61.0); 3250 double[2] correct = [61.0, 61.0]; 3251 assert(A.array == correct); 3252 } 3253 3254 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 3255 /// and zero the upper element. 3256 __m128d _mm_set_sd (double a) pure @trusted 3257 { 3258 double2 r = void; 3259 r.ptr[0] = a; 3260 r.ptr[1] = 0.0; 3261 return r; 3262 } 3263 unittest 3264 { 3265 __m128d A = _mm_set_sd(61.0); 3266 double[2] correct = [61.0, 0.0]; 3267 assert(A.array == correct); 3268 } 3269 3270 /// Broadcast 16-bit integer a to all elements of dst. 3271 __m128i _mm_set1_epi16 (short a) pure @trusted 3272 { 3273 version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 3274 { 3275 short8 v = a; 3276 return cast(__m128i) v; 3277 } 3278 else 3279 { 3280 pragma(inline, true); 3281 return cast(__m128i)(short8(a)); 3282 } 3283 } 3284 unittest 3285 { 3286 short8 a = cast(short8) _mm_set1_epi16(31); 3287 for (int i = 0; i < 8; ++i) 3288 assert(a.array[i] == 31); 3289 } 3290 3291 /// Broadcast 32-bit integer `a` to all elements. 3292 __m128i _mm_set1_epi32 (int a) pure @trusted 3293 { 3294 pragma(inline, true); 3295 return cast(__m128i)(int4(a)); 3296 } 3297 unittest 3298 { 3299 int4 a = cast(int4) _mm_set1_epi32(31); 3300 for (int i = 0; i < 4; ++i) 3301 assert(a.array[i] == 31); 3302 } 3303 3304 /// Broadcast 64-bit integer `a` to all elements. 
3305 __m128i _mm_set1_epi64 (__m64 a) pure @safe 3306 { 3307 return _mm_set_epi64(a, a); 3308 } 3309 unittest 3310 { 3311 long b = 0x1DEADCAFE; 3312 __m64 a; 3313 a.ptr[0] = b; 3314 long2 c = cast(long2) _mm_set1_epi64(a); 3315 assert(c.array[0] == b); 3316 assert(c.array[1] == b); 3317 } 3318 3319 /// Broadcast 64-bit integer `a` to all elements 3320 __m128i _mm_set1_epi64x (long a) pure @trusted 3321 { 3322 long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3323 return cast(__m128i)(b); 3324 } 3325 unittest 3326 { 3327 long b = 0x1DEADCAFE; 3328 long2 c = cast(long2) _mm_set1_epi64x(b); 3329 for (int i = 0; i < 2; ++i) 3330 assert(c.array[i] == b); 3331 } 3332 3333 /// Broadcast 8-bit integer `a` to all elements. 3334 __m128i _mm_set1_epi8 (byte a) pure @trusted 3335 { 3336 pragma(inline, true); 3337 byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3338 return cast(__m128i)(b); 3339 } 3340 unittest 3341 { 3342 byte16 b = cast(byte16) _mm_set1_epi8(31); 3343 for (int i = 0; i < 16; ++i) 3344 assert(b.array[i] == 31); 3345 } 3346 3347 alias _mm_set1_pd = _mm_set_pd1; 3348 3349 /// Set packed 16-bit integers with the supplied values in reverse order. 3350 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 3351 short e3, short e2, short e1, short e0) pure @trusted 3352 { 3353 short8 r = void; 3354 r.ptr[0] = e7; 3355 r.ptr[1] = e6; 3356 r.ptr[2] = e5; 3357 r.ptr[3] = e4; 3358 r.ptr[4] = e3; 3359 r.ptr[5] = e2; 3360 r.ptr[6] = e1; 3361 r.ptr[7] = e0; 3362 return cast(__m128i)(r); 3363 } 3364 unittest 3365 { 3366 short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0); 3367 short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0]; 3368 assert(A.array == correct); 3369 } 3370 3371 /// Set packed 32-bit integers with the supplied values in reverse order. 3372 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3373 { 3374 // Performs better than = void; with GDC 3375 pragma(inline, true); 3376 align(16) int[4] result = [e3, e2, e1, e0]; 3377 return *cast(__m128i*)(result.ptr); 3378 } 3379 unittest 3380 { 3381 int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647); 3382 int[4] correct = [-1, 0, -2147483648, 2147483647]; 3383 assert(A.array == correct); 3384 } 3385 3386 /// Set packed 64-bit integers with the supplied values in reverse order. 3387 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted 3388 { 3389 long2 r = void; 3390 r.ptr[0] = e1; 3391 r.ptr[1] = e0; 3392 return cast(__m128i)(r); 3393 } 3394 unittest 3395 { 3396 long2 A = cast(long2) _mm_setr_epi64(-1, 0); 3397 long[2] correct = [-1, 0]; 3398 assert(A.array == correct); 3399 } 3400 3401 /// Set packed 8-bit integers with the supplied values in reverse order. 
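/// Illustrative note: `setr` takes its arguments in memory order, so the first argument becomes
/// element 0; it is the reverse of `_mm_set_epi8`.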
3402 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12, 3403 byte e11, byte e10, byte e9, byte e8, 3404 byte e7, byte e6, byte e5, byte e4, 3405 byte e3, byte e2, byte e1, byte e0) pure @trusted 3406 { 3407 align(16) byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, 3408 e7, e6, e5, e4, e3, e2, e1, e0]; 3409 return *cast(__m128i*)(result.ptr); 3410 } 3411 unittest 3412 { 3413 byte16 R = cast(byte16) _mm_setr_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); 3414 byte[16] correct = [-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]; 3415 assert(R.array == correct); 3416 } 3417 3418 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order. 3419 __m128d _mm_setr_pd (double e1, double e0) pure @trusted 3420 { 3421 pragma(inline, true); 3422 double2 result; 3423 result.ptr[0] = e1; 3424 result.ptr[1] = e0; 3425 return result; 3426 } 3427 unittest 3428 { 3429 __m128d A = _mm_setr_pd(61.0, 55.0); 3430 double[2] correct = [61.0, 55.0]; 3431 assert(A.array == correct); 3432 } 3433 3434 /// Return vector of type `__m128d` with all elements set to zero. 3435 __m128d _mm_setzero_pd() pure @trusted 3436 { 3437 pragma(inline, true); 3438 double2 r = void; 3439 r.ptr[0] = 0.0; 3440 r.ptr[1] = 0.0; 3441 return r; 3442 } 3443 unittest 3444 { 3445 __m128d A = _mm_setzero_pd(); 3446 double[2] correct = [0.0, 0.0]; 3447 assert(A.array == correct); 3448 } 3449 3450 /// Return vector of type `__m128i` with all elements set to zero. 3451 __m128i _mm_setzero_si128() pure @trusted 3452 { 3453 pragma(inline, true); 3454 int4 r = void; 3455 r.ptr[0] = 0; 3456 r.ptr[1] = 0; 3457 r.ptr[2] = 0; 3458 r.ptr[3] = 0; 3459 return r; 3460 } 3461 unittest 3462 { 3463 __m128i A = _mm_setzero_si128(); 3464 int[4] correct = [0, 0, 0, 0]; 3465 assert(A.array == correct); 3466 } 3467 3468 /// Shuffle 32-bit integers in `a` using the control in `imm8`. 3469 /// See_also: `_MM_SHUFFLE`. 3470 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @trusted 3471 { 3472 // PERF DMD D_SIMD 3473 static if (GDC_with_SSE2) 3474 { 3475 return __builtin_ia32_pshufd(a, imm8); 3476 } 3477 else version(LDC) 3478 { 3479 return shufflevectorLDC!(int4, (imm8 >> 0) & 3, 3480 (imm8 >> 2) & 3, 3481 (imm8 >> 4) & 3, 3482 (imm8 >> 6) & 3)(a, a); 3483 } 3484 else 3485 { 3486 int4 r = void; 3487 r.ptr[0] = a.ptr[(imm8 >> 0) & 3]; 3488 r.ptr[1] = a.ptr[(imm8 >> 2) & 3]; 3489 r.ptr[2] = a.ptr[(imm8 >> 4) & 3]; 3490 r.ptr[3] = a.ptr[(imm8 >> 6) & 3]; 3491 return r; 3492 } 3493 } 3494 unittest 3495 { 3496 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 3497 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3498 int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A); 3499 int[4] expectedB = [ 3, 2, 1, 0 ]; 3500 assert(B.array == expectedB); 3501 } 3502 3503 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`. 3504 /// See_also: `_MM_SHUFFLE2`. 
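/// Illustrative note: bit 0 of `imm8` selects which lane of `a` goes to the low lane of the result and
/// bit 1 selects which lane of `b` goes to the high lane, e.g. `_MM_SHUFFLE2(1, 1)` picks `a[1]` and `b[1]`.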
3505 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @trusted
3506 {
3507 // PERF DMD D_SIMD
3508 static if (GDC_with_SSE2)
3509 {
3510 return __builtin_ia32_shufpd(a, b, imm8);
3511 }
3512 else version(LDC)
3513 {
3514 return shufflevectorLDC!(double2, 0 + ( imm8 & 1 ),
3515 2 + ( (imm8 >> 1) & 1 ))(a, b);
3516 }
3517 else
3518 {
3519 double2 r = void;
3520 r.ptr[0] = a.array[imm8 & 1];
3521 r.ptr[1] = b.array[(imm8 >> 1) & 1];
3522 return r;
3523 }
3524 }
3525 unittest
3526 {
3527 __m128d A = _mm_setr_pd(0.5, 2.0);
3528 __m128d B = _mm_setr_pd(4.0, 5.0);
3529 enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
3530 __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
3531 double[2] correct = [ 2.0, 5.0 ];
3532 assert(R.array == correct);
3533 }
3534
3535 /// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high
3536 /// 64 bits of result, with the low 64 bits being copied from `a` to result.
3537 /// See_also: `_MM_SHUFFLE`.
3538 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @trusted
3539 {
3540 // PERF DMD D_SIMD
3541 static if (GDC_with_SSE2)
3542 {
3543 return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
3544 }
3545 else version(LDC)
3546 {
3547 return cast(__m128i) shufflevectorLDC!(short8, 0, 1, 2, 3,
3548 4 + ( (imm8 >> 0) & 3 ),
3549 4 + ( (imm8 >> 2) & 3 ),
3550 4 + ( (imm8 >> 4) & 3 ),
3551 4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
3552 }
3553 else
3554 {
3555 short8 r = cast(short8)a;
3556 short8 sa = cast(short8)a;
3557 r.ptr[4] = sa.array[4 + ( (imm8 >> 0) & 3 ) ];
3558 r.ptr[5] = sa.array[4 + ( (imm8 >> 2) & 3 ) ];
3559 r.ptr[6] = sa.array[4 + ( (imm8 >> 4) & 3 ) ];
3560 r.ptr[7] = sa.array[4 + ( (imm8 >> 6) & 3 ) ];
3561 return cast(__m128i) r;
3562 }
3563 }
3564 unittest
3565 {
3566 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3567 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3568 short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
3569 short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
3570 assert(C.array == expectedC);
3571 }
3572
3573 /// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64
3574 /// bits of result, with the high 64 bits being copied from `a` to result.
3575 /// See_also: `_MM_SHUFFLE`.
3576 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @trusted
3577 {
3578 // PERF DMD D_SIMD
3579 static if (GDC_with_SSE2)
3580 {
3581 return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
3582 }
3583 else version(LDC)
3584 {
3585 return cast(__m128i) shufflevectorLDC!(short8, ( (imm8 >> 0) & 3 ),
3586 ( (imm8 >> 2) & 3 ),
3587 ( (imm8 >> 4) & 3 ),
3588 ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
3589 }
3590 else
3591 {
3592 short8 r = cast(short8)a;
3593 short8 sa = cast(short8)a;
3594 r.ptr[0] = sa.array[(imm8 >> 0) & 3];
3595 r.ptr[1] = sa.array[(imm8 >> 2) & 3];
3596 r.ptr[2] = sa.array[(imm8 >> 4) & 3];
3597 r.ptr[3] = sa.array[(imm8 >> 6) & 3];
3598 return cast(__m128i) r;
3599 }
3600 }
3601 unittest
3602 {
3603 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3604 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
3605 short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
3606 short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
3607 assert(B.array == expectedB);
3608 }
3609
3610 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
3611 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
3612 {
3613 static if (LDC_with_SSE2)
3614 {
3615 return __builtin_ia32_pslld128(a, count);
3616 }
3617 else static if (GDC_with_SSE2)
3618 {
3619 return __builtin_ia32_pslld128(a, count);
3620 }
3621 else static if (DMD_with_32bit_asm)
3622 {
3623 asm pure nothrow @nogc @trusted
3624 {
3625 movdqu XMM0, a;
3626 movdqu XMM1, count;
3627 pslld XMM0, XMM1;
3628 movdqu a, XMM0;
3629 }
3630 return a;
3631 }
3632 else
3633 {
3634 int4 r = void;
3635 long2 lc = cast(long2)count;
3636 int bits = cast(int)(lc.array[0]);
3637 foreach(i; 0..4)
3638 r[i] = cast(uint)(a[i]) << bits;
3639 return r;
3640 }
3641 }
3642
3643 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
3644 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
3645 {
3646 static if (LDC_with_SSE2)
3647 {
3648 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3649 }
3650 else static if (GDC_with_SSE2)
3651 {
3652 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
3653 }
3654 else static if (DMD_with_32bit_asm)
3655 {
3656 asm pure nothrow @nogc @trusted
3657 {
3658 movdqu XMM0, a;
3659 movdqu XMM1, count;
3660 psllq XMM0, XMM1;
3661 movdqu a, XMM0;
3662 }
3663 return a;
3664 }
3665 else
3666 {
3667 // ARM: good since LDC 1.12 -O2
3668 // but the -O0 version is catastrophic
3669 long2 r = void;
3670 long2 sa = cast(long2)a;
3671 long2 lc = cast(long2)count;
3672 int bits = cast(int)(lc.array[0]);
3673 foreach(i; 0..2)
3674 r.array[i] = cast(ulong)(sa.array[i]) << bits;
3675 return cast(__m128i)r;
3676 }
3677 }
3678
3679 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
3680 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
3681 {
3682 static if (LDC_with_SSE2)
3683 {
3684 return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3685 }
3686 else static if (GDC_with_SSE2)
3687 {
3688 return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
3689 }
3690 else static if (DMD_with_32bit_asm)
3691 {
3692 asm pure nothrow @nogc
3693 {
3694 movdqu XMM0, a;
3695 movdqu XMM1, count;
3696 psllw XMM0, XMM1;
3697 movdqu a, XMM0;
3698 }
3699 return a;
3700 }
3701 else
3702 {
3703 short8 sa = cast(short8)a;
3704 long2 lc = cast(long2)count;
3705 int bits = cast(int)(lc.array[0]);
3706 short8 r = void;
3707 foreach(i; 0..8)
3708 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
3709 return cast(int4)r;
3710 }
3711 }
3712
3713
3714 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
3715 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
3716 {
3717 static if (GDC_with_SSE2)
3718 {
3719 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3720 }
3721 else static if (LDC_with_SSE2)
3722 {
3723 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
3724 }
3725 else
3726 {
3727 // Note: the intrinsics guarantee imm8[0..7] is taken, however
3728 // D says "It's illegal to shift by the same or more bits
3729 // than the size of the quantity being shifted"
3730 // and it's UB instead.
3731 int4 r = _mm_setzero_si128(); 3732 3733 ubyte count = cast(ubyte) imm8; 3734 if (count > 31) 3735 return r; 3736 3737 foreach(i; 0..4) 3738 r.array[i] = cast(uint)(a.array[i]) << count; 3739 return r; 3740 } 3741 } 3742 unittest 3743 { 3744 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3745 __m128i B = _mm_slli_epi32(A, 1); 3746 __m128i B2 = _mm_slli_epi32(A, 1 + 256); 3747 int[4] expectedB = [ 0, 4, 6, -8]; 3748 assert(B.array == expectedB); 3749 assert(B2.array == expectedB); 3750 3751 __m128i C = _mm_slli_epi32(A, 0); 3752 int[4] expectedC = [ 0, 2, 3, -4]; 3753 assert(C.array == expectedC); 3754 3755 __m128i D = _mm_slli_epi32(A, 65); 3756 int[4] expectedD = [ 0, 0, 0, 0]; 3757 assert(D.array == expectedD); 3758 } 3759 3760 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 3761 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted 3762 { 3763 static if (GDC_with_SSE2) 3764 { 3765 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3766 } 3767 else static if (LDC_with_SSE2) 3768 { 3769 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3770 } 3771 else 3772 { 3773 long2 sa = cast(long2)a; 3774 3775 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3776 // D says "It's illegal to shift by the same or more bits 3777 // than the size of the quantity being shifted" 3778 // and it's UB instead. 3779 long2 r = cast(long2) _mm_setzero_si128(); 3780 ubyte count = cast(ubyte) imm8; 3781 if (count > 63) 3782 return cast(__m128i)r; 3783 3784 r.ptr[0] = cast(ulong)(sa.array[0]) << count; 3785 r.ptr[1] = cast(ulong)(sa.array[1]) << count; 3786 return cast(__m128i)r; 3787 } 3788 } 3789 unittest 3790 { 3791 __m128i A = _mm_setr_epi64(8, -4); 3792 long2 B = cast(long2) _mm_slli_epi64(A, 1); 3793 long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024); 3794 long[2] expectedB = [ 16, -8]; 3795 assert(B.array == expectedB); 3796 assert(B2.array == expectedB); 3797 3798 long2 C = cast(long2) _mm_slli_epi64(A, 0); 3799 long[2] expectedC = [ 8, -4]; 3800 assert(C.array == expectedC); 3801 3802 long2 D = cast(long2) _mm_slli_epi64(A, 64); 3803 long[2] expectedD = [ 0, -0]; 3804 assert(D.array == expectedD); 3805 } 3806 3807 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 
3808 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted 3809 { 3810 static if (GDC_with_SSE2) 3811 { 3812 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3813 } 3814 else static if (LDC_with_SSE2) 3815 { 3816 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3817 } 3818 else static if (LDC_with_ARM64) 3819 { 3820 short8 sa = cast(short8)a; 3821 short8 r = cast(short8)_mm_setzero_si128(); 3822 ubyte count = cast(ubyte) imm8; 3823 if (count > 15) 3824 return cast(__m128i)r; 3825 r = sa << short8(count); 3826 return cast(__m128i)r; 3827 } 3828 else 3829 { 3830 short8 sa = cast(short8)a; 3831 short8 r = cast(short8)_mm_setzero_si128(); 3832 ubyte count = cast(ubyte) imm8; 3833 if (count > 15) 3834 return cast(__m128i)r; 3835 foreach(i; 0..8) 3836 r.ptr[i] = cast(short)(sa.array[i] << count); 3837 return cast(__m128i)r; 3838 } 3839 } 3840 unittest 3841 { 3842 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3843 short8 B = cast(short8)( _mm_slli_epi16(A, 1) ); 3844 short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) ); 3845 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 3846 assert(B.array == expectedB); 3847 assert(B2.array == expectedB); 3848 3849 short8 C = cast(short8)( _mm_slli_epi16(A, 16) ); 3850 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 3851 assert(C.array == expectedC); 3852 } 3853 3854 3855 /// Shift `a` left by `bytes` bytes while shifting in zeros. 3856 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted 3857 { 3858 static if (bytes & 0xF0) 3859 { 3860 return _mm_setzero_si128(); 3861 } 3862 else static if (DMD_with_DSIMD) 3863 { 3864 return cast(__m128i) __simd_ib(XMM.PSLLDQ, op, bytes); 3865 } 3866 else static if (GDC_with_SSE2) 3867 { 3868 pragma(inline, true); // else it doesn't seem to be inlined at all by GDC TODO _mm_srli_si128 3869 return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 3870 } 3871 else version(LDC) 3872 { 3873 return cast(__m128i) shufflevectorLDC!(byte16, 3874 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes, 3875 22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes, 3876 28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes) 3877 (cast(byte16)_mm_setzero_si128(), cast(byte16)op); 3878 } 3879 else static if (DMD_with_32bit_asm) 3880 { 3881 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64 3882 { 3883 movdqu XMM0, op; 3884 pslldq XMM0, bytes; 3885 movdqu op, XMM0; 3886 } 3887 return op; 3888 } 3889 else 3890 { 3891 byte16 A = cast(byte16)op; 3892 byte16 R = void; 3893 for (int n = 15; n >= bytes; --n) 3894 R.ptr[n] = A.array[n-bytes]; 3895 for (int n = bytes-1; n >= 0; --n) 3896 R.ptr[n] = 0; 3897 return cast(__m128i)R; 3898 } 3899 } 3900 unittest 3901 { 3902 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3903 short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left 3904 short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ]; 3905 assert(R.array == correct); 3906 3907 __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1)); 3908 int[4] expectedB = [0, 0, 0, 0]; 3909 assert(B.array == expectedB); 3910 } 3911 3912 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`. 
3913 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted 3914 { 3915 version(LDC) 3916 { 3917 // Disappeared with LDC 1.11 3918 static if (__VERSION__ < 2081) 3919 return __builtin_ia32_sqrtpd(vec); 3920 else 3921 { 3922 // PERF: use llvm_sqrt on the vector 3923 vec.array[0] = llvm_sqrt(vec.array[0]); 3924 vec.array[1] = llvm_sqrt(vec.array[1]); 3925 return vec; 3926 } 3927 } 3928 else static if (GDC_with_SSE2) 3929 { 3930 return __builtin_ia32_sqrtpd(vec); 3931 } 3932 else 3933 { 3934 vec.ptr[0] = sqrt(vec.array[0]); 3935 vec.ptr[1] = sqrt(vec.array[1]); 3936 return vec; 3937 } 3938 } 3939 3940 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 3941 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 3942 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted 3943 { 3944 // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only. 3945 // "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 3946 // The quadword at bits 127:64 of the destination operand remains unchanged." 3947 version(LDC) 3948 { 3949 // Disappeared with LDC 1.11 3950 static if (__VERSION__ < 2081) 3951 { 3952 __m128d c = __builtin_ia32_sqrtsd(b); 3953 a[0] = c[0]; 3954 return a; 3955 } 3956 else 3957 { 3958 a.array[0] = llvm_sqrt(b.array[0]); 3959 return a; 3960 } 3961 } 3962 else static if (GDC_with_SSE2) 3963 { 3964 __m128d c = __builtin_ia32_sqrtsd(b); 3965 a.ptr[0] = c.array[0]; 3966 return a; 3967 } 3968 else 3969 { 3970 a.ptr[0] = sqrt(b.array[0]); 3971 return a; 3972 } 3973 } 3974 unittest 3975 { 3976 __m128d A = _mm_setr_pd(1.0, 3.0); 3977 __m128d B = _mm_setr_pd(4.0, 5.0); 3978 __m128d R = _mm_sqrt_sd(A, B); 3979 double[2] correct = [2.0, 3.0 ]; 3980 assert(R.array == correct); 3981 } 3982 3983 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits. 3984 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted 3985 { 3986 static if (GDC_with_SSE2) 3987 { 3988 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3989 } 3990 else static if (LDC_with_SSE2) 3991 { 3992 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3993 } 3994 else 3995 { 3996 short8 sa = cast(short8)a; 3997 long2 lc = cast(long2)count; 3998 int bits = cast(int)(lc.array[0]); 3999 short8 r = void; 4000 foreach(i; 0..8) 4001 r.ptr[i] = cast(short)(sa.array[i] >> bits); 4002 return cast(int4)r; 4003 } 4004 } 4005 4006 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits. 4007 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted 4008 { 4009 static if (LDC_with_SSE2) 4010 { 4011 return __builtin_ia32_psrad128(a, count); 4012 } 4013 else static if (GDC_with_SSE2) 4014 { 4015 return __builtin_ia32_psrad128(a, count); 4016 } 4017 else 4018 { 4019 int4 r = void; 4020 long2 lc = cast(long2)count; 4021 int bits = cast(int)(lc.array[0]); 4022 r.ptr[0] = (a.array[0] >> bits); 4023 r.ptr[1] = (a.array[1] >> bits); 4024 r.ptr[2] = (a.array[2] >> bits); 4025 r.ptr[3] = (a.array[3] >> bits); 4026 return r; 4027 } 4028 } 4029 4030 4031 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 
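/// Only the low 8 bits of `imm8` are considered; shift counts of 16 or more behave like a shift
/// by 15, leaving each lane filled with its sign bit (0 or -1).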
4032 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 4033 { 4034 static if (GDC_with_SSE2) 4035 { 4036 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 4037 } 4038 else static if (LDC_with_SSE2) 4039 { 4040 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 4041 } 4042 else static if (LDC_with_ARM64) 4043 { 4044 short8 sa = cast(short8)a; 4045 ubyte count = cast(ubyte)imm8; 4046 if (count > 15) 4047 count = 15; 4048 short8 r = sa >> short8(count); 4049 return cast(__m128i)r; 4050 } 4051 else 4052 { 4053 short8 sa = cast(short8)a; 4054 short8 r = void; 4055 4056 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4057 // D says "It's illegal to shift by the same or more bits 4058 // than the size of the quantity being shifted" 4059 // and it's UB instead. 4060 ubyte count = cast(ubyte)imm8; 4061 if (count > 15) 4062 count = 15; 4063 foreach(i; 0..8) 4064 r.ptr[i] = cast(short)(sa.array[i] >> count); 4065 return cast(int4)r; 4066 } 4067 } 4068 unittest 4069 { 4070 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 4071 short8 B = cast(short8)( _mm_srai_epi16(A, 1) ); 4072 short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) ); 4073 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 4074 assert(B.array == expectedB); 4075 assert(B2.array == expectedB); 4076 4077 short8 C = cast(short8)( _mm_srai_epi16(A, 18) ); 4078 short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ]; 4079 assert(C.array == expectedC); 4080 } 4081 4082 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 4083 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted 4084 { 4085 static if (LDC_with_SSE2) 4086 { 4087 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 4088 } 4089 else static if (GDC_with_SSE2) 4090 { 4091 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 4092 } 4093 else 4094 { 4095 int4 r = void; 4096 4097 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4098 // D says "It's illegal to shift by the same or more bits 4099 // than the size of the quantity being shifted" 4100 // and it's UB instead. 
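        // For example, _mm_srai_epi32(a, 32) must behave like a shift by 31
        // (each lane becomes 0 or -1 according to its sign), while `x >> 32`
        // on an `int` would be UB in D; hence the clamp below.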
4101 ubyte count = cast(ubyte) imm8; 4102 if (count > 31) 4103 count = 31; 4104 4105 r.ptr[0] = (a.array[0] >> count); 4106 r.ptr[1] = (a.array[1] >> count); 4107 r.ptr[2] = (a.array[2] >> count); 4108 r.ptr[3] = (a.array[3] >> count); 4109 return r; 4110 } 4111 } 4112 unittest 4113 { 4114 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 4115 __m128i B = _mm_srai_epi32(A, 1); 4116 __m128i B2 = _mm_srai_epi32(A, 1 + 256); 4117 int[4] expectedB = [ 0, 1, 1, -2]; 4118 assert(B.array == expectedB); 4119 assert(B2.array == expectedB); 4120 4121 __m128i C = _mm_srai_epi32(A, 32); 4122 int[4] expectedC = [ 0, 0, 0, -1]; 4123 assert(C.array == expectedC); 4124 4125 __m128i D = _mm_srai_epi32(A, 0); 4126 int[4] expectedD = [ 0, 2, 3, -4]; 4127 assert(D.array == expectedD); 4128 } 4129 4130 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted 4131 { 4132 static if (LDC_with_SSE2) 4133 { 4134 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 4135 } 4136 else static if (GDC_with_SSE2) 4137 { 4138 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 4139 } 4140 else 4141 { 4142 short8 sa = cast(short8)a; 4143 long2 lc = cast(long2)count; 4144 int bits = cast(int)(lc.array[0]); 4145 short8 r = void; 4146 foreach(i; 0..8) 4147 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits); 4148 return cast(int4)r; 4149 } 4150 } 4151 4152 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted 4153 { 4154 static if (LDC_with_SSE2) 4155 { 4156 return __builtin_ia32_psrld128(a, count); 4157 } 4158 else static if (GDC_with_SSE2) 4159 { 4160 return __builtin_ia32_psrld128(a, count); 4161 } 4162 else 4163 { 4164 int4 r = void; 4165 long2 lc = cast(long2)count; 4166 int bits = cast(int)(lc.array[0]); 4167 r.ptr[0] = cast(uint)(a.array[0]) >> bits; 4168 r.ptr[1] = cast(uint)(a.array[1]) >> bits; 4169 r.ptr[2] = cast(uint)(a.array[2]) >> bits; 4170 r.ptr[3] = cast(uint)(a.array[3]) >> bits; 4171 return r; 4172 } 4173 } 4174 4175 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted 4176 { 4177 static if (LDC_with_SSE2) 4178 { 4179 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 4180 } 4181 else static if (GDC_with_SSE2) 4182 { 4183 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 4184 } 4185 else 4186 { 4187 // Workaround for https://issues.dlang.org/show_bug.cgi?id=23047 4188 // => avoid void initialization. 4189 long2 r; 4190 long2 sa = cast(long2)a; 4191 long2 lc = cast(long2)count; 4192 int bits = cast(int)(lc.array[0]); 4193 r.ptr[0] = cast(ulong)(sa.array[0]) >> bits; 4194 r.ptr[1] = cast(ulong)(sa.array[1]) >> bits; 4195 return cast(__m128i)r; 4196 } 4197 } 4198 4199 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 
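/// Only the low 8 bits of `imm8` are considered; shift counts of 16 or more produce zero.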
4200 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted 4201 { 4202 static if (GDC_with_SSE2) 4203 { 4204 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 4205 } 4206 else static if (LDC_with_SSE2) 4207 { 4208 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 4209 } 4210 else static if (LDC_with_ARM64) 4211 { 4212 short8 sa = cast(short8)a; 4213 short8 r = cast(short8) _mm_setzero_si128(); 4214 4215 ubyte count = cast(ubyte)imm8; 4216 if (count >= 16) 4217 return cast(__m128i)r; 4218 4219 r = sa >>> short8(count); // This facility offered with LDC, but not DMD. 4220 return cast(__m128i)r; 4221 } 4222 else 4223 { 4224 short8 sa = cast(short8)a; 4225 ubyte count = cast(ubyte)imm8; 4226 4227 short8 r = cast(short8) _mm_setzero_si128(); 4228 if (count >= 16) 4229 return cast(__m128i)r; 4230 4231 foreach(i; 0..8) 4232 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count); 4233 return cast(__m128i)r; 4234 } 4235 } 4236 unittest 4237 { 4238 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 4239 short8 B = cast(short8)( _mm_srli_epi16(A, 1) ); 4240 short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) ); 4241 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 4242 assert(B.array == expectedB); 4243 assert(B2.array == expectedB); 4244 4245 short8 C = cast(short8)( _mm_srli_epi16(A, 16) ); 4246 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0]; 4247 assert(C.array == expectedC); 4248 4249 short8 D = cast(short8)( _mm_srli_epi16(A, 0) ); 4250 short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ]; 4251 assert(D.array == expectedD); 4252 } 4253 4254 4255 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 4256 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted 4257 { 4258 static if (GDC_with_SSE2) 4259 { 4260 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4261 } 4262 else static if (LDC_with_SSE2) 4263 { 4264 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4265 } 4266 else 4267 { 4268 ubyte count = cast(ubyte) imm8; 4269 4270 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4271 // D says "It's illegal to shift by the same or more bits 4272 // than the size of the quantity being shifted" 4273 // and it's UB instead. 4274 int4 r = _mm_setzero_si128(); 4275 if (count >= 32) 4276 return r; 4277 r.ptr[0] = a.array[0] >>> count; 4278 r.ptr[1] = a.array[1] >>> count; 4279 r.ptr[2] = a.array[2] >>> count; 4280 r.ptr[3] = a.array[3] >>> count; 4281 return r; 4282 } 4283 } 4284 unittest 4285 { 4286 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 4287 __m128i B = _mm_srli_epi32(A, 1); 4288 __m128i B2 = _mm_srli_epi32(A, 1 + 256); 4289 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; 4290 assert(B.array == expectedB); 4291 assert(B2.array == expectedB); 4292 4293 __m128i C = _mm_srli_epi32(A, 255); 4294 int[4] expectedC = [ 0, 0, 0, 0 ]; 4295 assert(C.array == expectedC); 4296 } 4297 4298 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 
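/// Only the low 8 bits of `imm8` are considered; shift counts of 64 or more produce zero.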
4299 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted 4300 { 4301 // PERF DMD 4302 static if (GDC_with_SSE2) 4303 { 4304 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4305 } 4306 else static if (LDC_with_SSE2) 4307 { 4308 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4309 } 4310 else 4311 { 4312 long2 r = cast(long2) _mm_setzero_si128(); 4313 long2 sa = cast(long2)a; 4314 4315 ubyte count = cast(ubyte) imm8; 4316 if (count >= 64) 4317 return cast(__m128i)r; 4318 4319 r.ptr[0] = sa.array[0] >>> count; 4320 r.ptr[1] = sa.array[1] >>> count; 4321 return cast(__m128i)r; 4322 } 4323 } 4324 unittest 4325 { 4326 __m128i A = _mm_setr_epi64(8, -4); 4327 long2 B = cast(long2) _mm_srli_epi64(A, 1); 4328 long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512); 4329 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; 4330 assert(B.array == expectedB); 4331 assert(B2.array == expectedB); 4332 4333 long2 C = cast(long2) _mm_srli_epi64(A, 64); 4334 long[2] expectedC = [ 0, 0 ]; 4335 assert(C.array == expectedC); 4336 } 4337 4338 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4339 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @trusted 4340 { 4341 static if (bytes & 0xF0) 4342 { 4343 return _mm_setzero_si128(); 4344 } 4345 else static if (DMD_with_DSIMD) 4346 { 4347 return cast(__m128i) __simd_ib(XMM.PSRLDQ, v, bytes); 4348 } 4349 else static if (GDC_with_SSE2) 4350 { 4351 return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8)); 4352 } 4353 else static if (DMD_with_32bit_asm) 4354 { 4355 asm pure nothrow @nogc @trusted 4356 { 4357 movdqu XMM0, v; 4358 psrldq XMM0, bytes; 4359 movdqu v, XMM0; 4360 } 4361 return v; 4362 } 4363 else version(LDC) 4364 { 4365 return cast(__m128i) shufflevectorLDC!(byte16, 4366 bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7, 4367 bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15) 4368 (cast(byte16) v, cast(byte16)_mm_setzero_si128()); 4369 } 4370 else 4371 { 4372 byte16 A = cast(byte16)v; 4373 byte16 R = void; 4374 for (int n = 0; n < bytes; ++n) 4375 R.ptr[15-n] = 0; 4376 for (int n = bytes; n < 16; ++n) 4377 R.ptr[15-n] = A.array[15 - n + bytes]; 4378 return cast(__m128i)R; 4379 } 4380 } 4381 unittest 4382 { 4383 __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, -2, 1)); 4384 int[4] correct = [-2, 3, 4, 0]; 4385 assert(R.array == correct); 4386 4387 __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1)); 4388 int[4] expectedA = [0, 0, 0, 0]; 4389 assert(A.array == expectedA); 4390 } 4391 4392 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4393 /// #BONUS 4394 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe 4395 { 4396 return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v); 4397 } 4398 unittest 4399 { 4400 __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)); 4401 float[4] correct = [3.0f, 4.0f, 0, 0]; 4402 assert(R.array == correct); 4403 } 4404 4405 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4406 /// #BONUS 4407 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe 4408 { 4409 return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v); 4410 } 4411 4412 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 4413 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
{
    pragma(inline, true);
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}
unittest
{
    align(16) double[2] A;
    __m128d B = _mm_setr_pd(-8.0, 9.0);
    _mm_store_pd(A.ptr, B);
    assert(A == [-8.0, 9.0]);
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    __m128d r; // PERF =void;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[0];
    *aligned = r;
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to
/// be aligned on any particular boundary.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[0];
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a
/// general-protection exception may be generated.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1; ///

/// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[1];
}

// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
/// Store 64-bit integer from the first element of `a` into memory.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    long* dest = cast(long*)mem_addr;
    long2 la = cast(long2)a;
    *dest = la.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[0];
}

/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse
/// order. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @system
{
    __m128d reversed = void;
    reversed.ptr[0] = a.array[1];
    reversed.ptr[1] = a.array[0];
    *cast(__m128d*)mem_addr = reversed;
}
unittest
{
    align(16) double[2] A = [0.0, 1.0];
    _mm_storer_pd(A.ptr, _mm_setr_pd(2.0, 3.0));
    assert(A[0] == 3.0 && A[1] == 2.0);
}
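// Illustrative example: the broadcast and scalar store helpers above, shown together.
unittest
{
    align(16) double[2] A = [0.0, 0.0];
    __m128d v = _mm_setr_pd(4.0, 5.0);

    _mm_store_pd1(A.ptr, v);   // broadcast the lower element
    assert(A == [4.0, 4.0]);

    double lo, hi;
    _mm_storel_pd(&lo, v);     // lower element
    _mm_storeh_pd(&hi, v);     // upper element
    assert(lo == 4.0 && hi == 5.0);
}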
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @trusted // TODO: signature should be @system
{
    // PERF DMD
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_storeupd(mem_addr, a);
    }
    else version(LDC)
    {
        storeUnaligned!double2(a, mem_addr);
    }
    else
    {
        mem_addr[0] = a.array[0];
        mem_addr[1] = a.array[1];
    }
}
unittest
{
    __m128d A = _mm_setr_pd(3.0, 4.0);
    align(16) double[4] R = [0.0, 0, 0, 0];
    double[2] correct = [3.0, 4.0];
    _mm_storeu_pd(&R[1], A);
    assert(R[1..3] == correct);
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular
/// boundary.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @trusted // TODO: signature is wrong, mem_addr is not aligned. Make it @system.
{
    // PERF: DMD
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_storedqu(cast(char*)mem_addr, cast(ubyte16)a);
    }
    else version(LDC)
    {
        storeUnaligned!__m128i(a, cast(int*)mem_addr);
    }
    else
    {
        int* p = cast(int*)mem_addr;
        p[0] = a.array[0];
        p[1] = a.array[1];
        p[2] = a.array[2];
        p[3] = a.array[3];
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    align(16) int[6] R = [0, 0, 0, 0, 0, 0];
    int[4] correct = [1, 2, 3, 4];
    _mm_storeu_si128(cast(__m128i*)(&R[1]), A);
    assert(R[1..5] == correct);
}

/// Store 16-bit integer from the first element of `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si16 (void* mem_addr, __m128i a) pure @system
{
    short* dest = cast(short*)mem_addr;
    *dest = (cast(short8)a).array[0];
}
unittest
{
    short[2] arr = [-24, 12];
    _mm_storeu_si16(&arr[1], _mm_set1_epi16(26));
    short[2] correct = [-24, 26];
    assert(arr == correct);
}

/// Store 32-bit integer from the first element of `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted // TODO: should really be @system
{
    pragma(inline, true);
    int* dest = cast(int*)mem_addr;
    *dest = a.array[0];
}
unittest
{
    int[2] arr = [-24, 12];
    _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
    assert(arr == [-24, -1]);
}

/// Store 64-bit integer from the first element of `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si64 (void* mem_addr, __m128i a) pure @system
{
    pragma(inline, true);
    long* dest = cast(long*)mem_addr;
    long2 la = cast(long2)a;
    *dest = la.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storeu_si64(&A[1], _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
4617 void _mm_stream_pd (double* mem_addr, __m128d a) pure @system 4618 { 4619 // PERF DMD D_SIMD 4620 static if (GDC_with_SSE2) 4621 { 4622 return __builtin_ia32_movntpd(mem_addr, a); 4623 } 4624 else static if (LDC_with_InlineIREx) 4625 { 4626 enum prefix = `!0 = !{ i32 1 }`; 4627 enum ir = ` 4628 store <2 x double> %1, <2 x double>* %0, align 16, !nontemporal !0 4629 ret void`; 4630 LDCInlineIREx!(prefix, ir, "", void, double2*, double2)(cast(double2*)mem_addr, a); 4631 } 4632 else 4633 { 4634 // Regular store instead. 4635 __m128d* dest = cast(__m128d*)mem_addr; 4636 *dest = a; 4637 } 4638 } 4639 unittest 4640 { 4641 align(16) double[2] A; 4642 __m128d B = _mm_setr_pd(-8.0, 9.0); 4643 _mm_stream_pd(A.ptr, B); 4644 assert(A == [-8.0, 9.0]); 4645 } 4646 4647 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint. 4648 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception 4649 /// may be generated. 4650 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 4651 void _mm_stream_si128 (__m128i* mem_addr, __m128i a) pure @trusted 4652 { 4653 // PERF DMD D_SIMD 4654 static if (GDC_with_SSE2) 4655 { 4656 return __builtin_ia32_movntdq (cast(long2*)mem_addr, cast(long2)a); 4657 } 4658 else static if (LDC_with_InlineIREx) 4659 { 4660 enum prefix = `!0 = !{ i32 1 }`; 4661 enum ir = ` 4662 store <4 x i32> %1, <4 x i32>* %0, align 16, !nontemporal !0 4663 ret void`; 4664 LDCInlineIREx!(prefix, ir, "", void, int4*, int4)(cast(int4*)mem_addr, a); 4665 } 4666 else 4667 { 4668 // Regular store instead. 4669 __m128i* dest = cast(__m128i*)mem_addr; 4670 *dest = a; 4671 } 4672 } 4673 unittest 4674 { 4675 align(16) int[4] A; 4676 __m128i B = _mm_setr_epi32(-8, 9, 10, -11); 4677 _mm_stream_si128(cast(__m128i*)A.ptr, B); 4678 assert(A == [-8, 9, 10, -11]); 4679 } 4680 4681 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache 4682 /// pollution. If the cache line containing address `mem_addr` is already in the cache, 4683 /// the cache will be updated. 4684 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 4685 void _mm_stream_si32 (int* mem_addr, int a) pure @trusted 4686 { 4687 // PERF DMD D_SIMD 4688 static if (GDC_with_SSE2) 4689 { 4690 return __builtin_ia32_movnti(mem_addr, a); 4691 } 4692 else static if (LDC_with_InlineIREx) 4693 { 4694 enum prefix = `!0 = !{ i32 1 }`; 4695 enum ir = ` 4696 store i32 %1, i32* %0, !nontemporal !0 4697 ret void`; 4698 LDCInlineIREx!(prefix, ir, "", void, int*, int)(mem_addr, a); 4699 } 4700 else 4701 { 4702 // Regular store instead. 4703 *mem_addr = a; 4704 } 4705 } 4706 unittest 4707 { 4708 int A; 4709 _mm_stream_si32(&A, -34); 4710 assert(A == -34); 4711 } 4712 4713 /// Store 64-bit integer a into memory using a non-temporal hint to minimize 4714 /// cache pollution. If the cache line containing address `mem_addr` is already 4715 /// in the cache, the cache will be updated. 4716 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 
4717 void _mm_stream_si64 (long* mem_addr, long a) pure @trusted 4718 { 4719 // PERF DMD D_SIMD 4720 static if (GDC_with_SSE2) 4721 { 4722 return __builtin_ia32_movnti64(mem_addr, a); 4723 } 4724 else static if (LDC_with_InlineIREx) 4725 { 4726 enum prefix = `!0 = !{ i32 1 }`; 4727 enum ir = ` 4728 store i64 %1, i64* %0, !nontemporal !0 4729 ret void`; 4730 LDCInlineIREx!(prefix, ir, "", void, long*, long)(mem_addr, a); 4731 4732 } 4733 else 4734 { 4735 // Regular store instead. 4736 *mem_addr = a; 4737 } 4738 } 4739 unittest 4740 { 4741 long A; 4742 _mm_stream_si64(&A, -46); 4743 assert(A == -46); 4744 } 4745 4746 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. 4747 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe 4748 { 4749 pragma(inline, true); 4750 return cast(__m128i)(cast(short8)a - cast(short8)b); 4751 } 4752 unittest 4753 { 4754 __m128i A = _mm_setr_epi16(16, 32767, 1, 2, 3, 4, 6, 6); 4755 __m128i B = _mm_setr_epi16(15, -32768, 6, 8, 1000, 1, 5, 6); 4756 short8 C = cast(short8) _mm_sub_epi16(A, B); 4757 short[8] correct = [ 1, -1,-5,-6, -997, 3, 1, 0]; 4758 assert(C.array == correct); 4759 } 4760 4761 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. 4762 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe 4763 { 4764 pragma(inline, true); 4765 return cast(__m128i)(cast(int4)a - cast(int4)b); 4766 } 4767 unittest 4768 { 4769 __m128i A = _mm_setr_epi32(16, int.max, 1, 8); 4770 __m128i B = _mm_setr_epi32(15, int.min, 6, 2); 4771 int4 C = cast(int4) _mm_sub_epi32(A, B); 4772 int[4] correct = [ 1, -1,-5, 6]; 4773 assert(C.array == correct); 4774 } 4775 4776 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. 4777 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe 4778 { 4779 pragma(inline, true); 4780 return cast(__m128i)(cast(long2)a - cast(long2)b); 4781 } 4782 unittest 4783 { 4784 __m128i A = _mm_setr_epi64( 16, long.max); 4785 __m128i B = _mm_setr_epi64( 199, long.min); 4786 long2 C = cast(long2) _mm_sub_epi64(A, B); 4787 long[2] correct = [-183, -1]; 4788 assert(C.array == correct); 4789 } 4790 4791 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. 4792 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe 4793 { 4794 pragma(inline, true); 4795 return cast(__m128i)(cast(byte16)a - cast(byte16)b); 4796 } 4797 unittest 4798 { 4799 __m128i A = _mm_setr_epi8(16, 127, 1, 2, 3, 4, 6, 6, 16, 127, 1, 2, 3, 4, 6, 6); 4800 __m128i B = _mm_setr_epi8(15, -128, 6, 8, 3, 1, 5, 6, 16, 127, 1, 2, 3, 4, 6, 6); 4801 byte16 C = cast(byte16) _mm_sub_epi8(A, B); 4802 byte[16] correct = [ 1, -1,-5,-6, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 4803 assert(C.array == correct); 4804 } 4805 4806 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 4807 /// floating-point elements in `a`. 4808 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe 4809 { 4810 pragma(inline, true); 4811 return a - b; 4812 } 4813 unittest 4814 { 4815 __m128d A = _mm_setr_pd(4000.0, -8.0); 4816 __m128d B = _mm_setr_pd(12.0, -8450.0); 4817 __m128d C = _mm_sub_pd(A, B); 4818 double[2] correct = [3988.0, 8442.0]; 4819 assert(C.array == correct); 4820 } 4821 4822 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 4823 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the 4824 /// upper element of result. 
4825 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted 4826 { 4827 version(DigitalMars) 4828 { 4829 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 4830 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 4831 asm pure nothrow @nogc @trusted { nop;} 4832 a[0] = a[0] - b[0]; 4833 return a; 4834 } 4835 else static if (GDC_with_SSE2) 4836 { 4837 return __builtin_ia32_subsd(a, b); 4838 } 4839 else 4840 { 4841 a.ptr[0] -= b.array[0]; 4842 return a; 4843 } 4844 } 4845 unittest 4846 { 4847 __m128d a = [1.5, -2.0]; 4848 a = _mm_sub_sd(a, a); 4849 assert(a.array == [0.0, -2.0]); 4850 } 4851 4852 /// Subtract 64-bit integer `b` from 64-bit integer `a`. 4853 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe 4854 { 4855 pragma(inline, true); 4856 return a - b; 4857 } 4858 unittest 4859 { 4860 __m64 A, B; 4861 A = -1214; 4862 B = 489415; 4863 __m64 C = _mm_sub_si64(B, A); 4864 assert(C.array[0] == 489415 + 1214); 4865 } 4866 4867 /// Subtract packed signed 16-bit integers in `b` from packed 16-bit integers in `a` using 4868 /// saturation. 4869 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted 4870 { 4871 // PERF DMD psubsw 4872 version(LDC) 4873 { 4874 return cast(__m128i) inteli_llvm_subs!short8(cast(short8)a, cast(short8)b); 4875 } 4876 else static if (GDC_with_SSE2) 4877 { 4878 return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b); 4879 } 4880 else 4881 { 4882 short[8] res; // PERF =void; 4883 short8 sa = cast(short8)a; 4884 short8 sb = cast(short8)b; 4885 foreach(i; 0..8) 4886 res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]); 4887 return _mm_loadu_si128(cast(int4*)res.ptr); 4888 } 4889 } 4890 unittest 4891 { 4892 short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0), 4893 _mm_setr_epi16(-10 , 16, 5, 4, 3, 2, 1, 0)); 4894 static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0]; 4895 assert(res.array == correctResult); 4896 } 4897 4898 /// Subtract packed signed 8-bit integers in `b` from packed 8-bit integers in `a` using 4899 /// saturation. 4900 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted 4901 { 4902 version(LDC) 4903 { 4904 return cast(__m128i) inteli_llvm_subs!byte16(cast(byte16)a, cast(byte16)b); 4905 } 4906 else static if (GDC_with_SSE2) 4907 { 4908 return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b); 4909 } 4910 else 4911 { 4912 byte[16] res; // PERF =void; 4913 byte16 sa = cast(byte16)a; 4914 byte16 sb = cast(byte16)b; 4915 foreach(i; 0..16) 4916 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]); 4917 return _mm_loadu_si128(cast(int4*)res.ptr); 4918 } 4919 } 4920 unittest 4921 { 4922 byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 4923 _mm_setr_epi8( 15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 4924 static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 4925 assert(res.array == correctResult); 4926 } 4927 4928 /// Subtract packed 16-bit unsigned integers in `a` and `b` using unsigned saturation. 
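/// In each lane, `b` is subtracted from `a` and the result is clamped to the range [0, 65535].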
4929 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted 4930 { 4931 version(LDC) 4932 { 4933 return cast(__m128i) inteli_llvm_subus!short8(cast(short8)a, cast(short8)b); 4934 } 4935 else static if (GDC_with_SSE2) 4936 { 4937 return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b); 4938 } 4939 else 4940 { 4941 short[8] res; // PERF =void; 4942 short8 sa = cast(short8)a; 4943 short8 sb = cast(short8)b; 4944 foreach(i; 0..8) 4945 { 4946 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]); 4947 res[i] = saturateSignedIntToUnsignedShort(sum); 4948 } 4949 return _mm_loadu_si128(cast(int4*)res.ptr); 4950 } 4951 } 4952 unittest 4953 { 4954 short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534, 1, 5, 4, 3, 2, 1, 0), 4955 _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0)); 4956 static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0]; 4957 assert(R.array == correct); 4958 } 4959 4960 /// Subtract packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 4961 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted 4962 { 4963 version(LDC) 4964 { 4965 return cast(__m128i) inteli_llvm_subus!byte16(cast(byte16)a, cast(byte16)b); 4966 } 4967 else static if (GDC_with_SSE2) 4968 { 4969 return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b); 4970 } 4971 else 4972 { 4973 ubyte[16] res; // PERF =void; 4974 byte16 sa = cast(byte16)a; 4975 byte16 sb = cast(byte16)b; 4976 foreach(i; 0..16) 4977 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i])); 4978 return _mm_loadu_si128(cast(int4*)res.ptr); 4979 } 4980 } 4981 unittest 4982 { 4983 byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 4984 _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 4985 static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 4986 assert(res.array == correctResult); 4987 } 4988 4989 // Note: the only difference between these intrinsics is the signalling 4990 // behaviour of quiet NaNs. This is incorrect but the case where 4991 // you would want to differentiate between qNaN and sNaN and then 4992 // treat them differently on purpose seems extremely rare. 4993 alias _mm_ucomieq_sd = _mm_comieq_sd; /// 4994 alias _mm_ucomige_sd = _mm_comige_sd; /// 4995 alias _mm_ucomigt_sd = _mm_comigt_sd; /// 4996 alias _mm_ucomile_sd = _mm_comile_sd; /// 4997 alias _mm_ucomilt_sd = _mm_comilt_sd; /// 4998 alias _mm_ucomineq_sd = _mm_comineq_sd; /// 4999 5000 /// Return vector of type `__m128d` with undefined elements. 5001 __m128d _mm_undefined_pd() pure @safe 5002 { 5003 pragma(inline, true); 5004 __m128d result = void; 5005 return result; 5006 } 5007 5008 /// Return vector of type `__m128i` with undefined elements. 5009 __m128i _mm_undefined_si128() pure @safe 5010 { 5011 pragma(inline, true); 5012 __m128i result = void; 5013 return result; 5014 } 5015 5016 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`. 
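/// The result interleaves the upper halves: [a4, b4, a5, b5, a6, b6, a7, b7].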
5017 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @trusted 5018 { 5019 // PERF DMD D_SIMD 5020 static if (GDC_with_SSE2) 5021 { 5022 return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b); 5023 } 5024 else version(LDC) 5025 { 5026 return cast(__m128i) shufflevectorLDC!(short8, 4, 12, 5, 13, 6, 14, 7, 15) 5027 (cast(short8)a, cast(short8)b); 5028 } 5029 else static if (DMD_with_32bit_asm) 5030 { 5031 asm pure nothrow @nogc @trusted 5032 { 5033 movdqu XMM0, a; 5034 movdqu XMM1, b; 5035 punpckhwd XMM0, XMM1; 5036 movdqu a, XMM0; 5037 } 5038 return a; 5039 } 5040 else 5041 { 5042 short8 r = void; 5043 short8 sa = cast(short8)a; 5044 short8 sb = cast(short8)b; 5045 r.ptr[0] = sa.array[4]; 5046 r.ptr[1] = sb.array[4]; 5047 r.ptr[2] = sa.array[5]; 5048 r.ptr[3] = sb.array[5]; 5049 r.ptr[4] = sa.array[6]; 5050 r.ptr[5] = sb.array[6]; 5051 r.ptr[6] = sa.array[7]; 5052 r.ptr[7] = sb.array[7]; 5053 return cast(__m128i)r; 5054 } 5055 } 5056 unittest 5057 { 5058 __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11); 5059 __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19); 5060 short8 C = cast(short8)(_mm_unpackhi_epi16(A, B)); 5061 short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19]; 5062 assert(C.array == correct); 5063 } 5064 5065 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`. 5066 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted 5067 { 5068 static if (GDC_with_SSE2) 5069 { 5070 return __builtin_ia32_punpckhdq128(a, b); 5071 } 5072 else version(LDC) 5073 { 5074 return shufflevectorLDC!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b); 5075 } 5076 else 5077 { 5078 __m128i r = void; 5079 r.ptr[0] = a.array[2]; 5080 r.ptr[1] = b.array[2]; 5081 r.ptr[2] = a.array[3]; 5082 r.ptr[3] = b.array[3]; 5083 return r; 5084 } 5085 } 5086 unittest 5087 { 5088 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 5089 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 5090 __m128i C = _mm_unpackhi_epi32(A, B); 5091 int[4] correct = [3, 7, 4, 8]; 5092 assert(C.array == correct); 5093 } 5094 5095 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. 5096 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted 5097 { 5098 static if (GDC_with_SSE2) 5099 { 5100 return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b); 5101 } 5102 else 5103 { 5104 __m128i r = cast(__m128i)b; 5105 r[0] = a[2]; 5106 r[1] = a[3]; 5107 return r; 5108 } 5109 } 5110 unittest // Issue #36 5111 { 5112 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 5113 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 5114 long2 C = cast(long2)(_mm_unpackhi_epi64(A, B)); 5115 long[2] correct = [0x33333333_33333333, 0x55555555_55555555]; 5116 assert(C.array == correct); 5117 } 5118 5119 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. 
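/// The result interleaves the upper halves: [a8, b8, a9, b9, ..., a15, b15].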
5120 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @trusted 5121 { 5122 // PERF DMD D_SIMD 5123 static if (GDC_with_SSE2) 5124 { 5125 return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b); 5126 } 5127 else static if (DMD_with_32bit_asm) 5128 { 5129 asm pure nothrow @nogc @trusted 5130 { 5131 movdqu XMM0, a; 5132 movdqu XMM1, b; 5133 punpckhbw XMM0, XMM1; 5134 movdqu a, XMM0; 5135 } 5136 return a; 5137 } 5138 else version(LDC) 5139 { 5140 return cast(__m128i)shufflevectorLDC!(byte16, 8, 24, 9, 25, 10, 26, 11, 27, 5141 12, 28, 13, 29, 14, 30, 15, 31) 5142 (cast(byte16)a, cast(byte16)b); 5143 } 5144 else 5145 { 5146 byte16 r = void; 5147 byte16 ba = cast(byte16)a; 5148 byte16 bb = cast(byte16)b; 5149 r.ptr[0] = ba.array[8]; 5150 r.ptr[1] = bb.array[8]; 5151 r.ptr[2] = ba.array[9]; 5152 r.ptr[3] = bb.array[9]; 5153 r.ptr[4] = ba.array[10]; 5154 r.ptr[5] = bb.array[10]; 5155 r.ptr[6] = ba.array[11]; 5156 r.ptr[7] = bb.array[11]; 5157 r.ptr[8] = ba.array[12]; 5158 r.ptr[9] = bb.array[12]; 5159 r.ptr[10] = ba.array[13]; 5160 r.ptr[11] = bb.array[13]; 5161 r.ptr[12] = ba.array[14]; 5162 r.ptr[13] = bb.array[14]; 5163 r.ptr[14] = ba.array[15]; 5164 r.ptr[15] = bb.array[15]; 5165 return cast(__m128i)r; 5166 } 5167 } 5168 unittest 5169 { 5170 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 5171 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 5172 byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B); 5173 byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]; 5174 assert(C.array == correct); 5175 } 5176 5177 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`. 5178 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @trusted 5179 { 5180 // PERF DMD D_SIMD 5181 static if (GDC_with_SSE2) 5182 { 5183 return __builtin_ia32_unpckhpd(a, b); 5184 } 5185 else version(LDC) 5186 { 5187 return shufflevectorLDC!(__m128d, 1, 3)(a, b); 5188 } 5189 else 5190 { 5191 double2 r = void; 5192 r.ptr[0] = a.array[1]; 5193 r.ptr[1] = b.array[1]; 5194 return r; 5195 } 5196 } 5197 unittest 5198 { 5199 __m128d A = _mm_setr_pd(4.0, 6.0); 5200 __m128d B = _mm_setr_pd(7.0, 9.0); 5201 __m128d C = _mm_unpackhi_pd(A, B); 5202 double[2] correct = [6.0, 9.0]; 5203 assert(C.array == correct); 5204 } 5205 5206 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. 
5207 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @trusted 5208 { 5209 // PERF DMD SIMD 5210 static if (GDC_with_SSE2) 5211 { 5212 return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b); 5213 } 5214 else version(LDC) 5215 { 5216 return cast(__m128i) shufflevectorLDC!(short8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(short8)a, cast(short8)b); 5217 } 5218 else static if (DMD_with_32bit_asm) 5219 { 5220 asm pure nothrow @nogc @trusted 5221 { 5222 movdqu XMM0, a; 5223 movdqu XMM1, b; 5224 punpcklwd XMM0, XMM1; 5225 movdqu a, XMM0; 5226 } 5227 return a; 5228 } 5229 else 5230 { 5231 short8 r = void; 5232 short8 sa = cast(short8)a; 5233 short8 sb = cast(short8)b; 5234 r.ptr[0] = sa.array[0]; 5235 r.ptr[1] = sb.array[0]; 5236 r.ptr[2] = sa.array[1]; 5237 r.ptr[3] = sb.array[1]; 5238 r.ptr[4] = sa.array[2]; 5239 r.ptr[5] = sb.array[2]; 5240 r.ptr[6] = sa.array[3]; 5241 r.ptr[7] = sb.array[3]; 5242 return cast(__m128i)r; 5243 } 5244 } 5245 unittest 5246 { 5247 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 5248 __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); 5249 short8 C = cast(short8) _mm_unpacklo_epi16(A, B); 5250 short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11]; 5251 assert(C.array == correct); 5252 } 5253 5254 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. 5255 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted 5256 { 5257 // PERF DMD 5258 static if (GDC_with_SSE2) 5259 { 5260 return __builtin_ia32_punpckldq128(a, b); 5261 } 5262 else version(LDC) 5263 { 5264 return shufflevectorLDC!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b); 5265 } 5266 else 5267 { 5268 __m128i r; 5269 r.ptr[0] = a.array[0]; 5270 r.ptr[1] = b.array[0]; 5271 r.ptr[2] = a.array[1]; 5272 r.ptr[3] = b.array[1]; 5273 return r; 5274 } 5275 } 5276 unittest 5277 { 5278 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 5279 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 5280 __m128i C = _mm_unpacklo_epi32(A, B); 5281 int[4] correct = [1, 5, 2, 6]; 5282 assert(C.array == correct); 5283 } 5284 5285 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. 5286 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted 5287 { 5288 static if (GDC_with_SSE2) 5289 { 5290 return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b); 5291 } 5292 else 5293 { 5294 long2 lA = cast(long2)a; 5295 long2 lB = cast(long2)b; 5296 long2 R; // PERF =void; 5297 R.ptr[0] = lA.array[0]; 5298 R.ptr[1] = lB.array[0]; 5299 return cast(__m128i)R; 5300 } 5301 } 5302 unittest // Issue #36 5303 { 5304 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 5305 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 5306 long2 C = cast(long2)(_mm_unpacklo_epi64(A, B)); 5307 long[2] correct = [0x22222222_22222222, 0x44444444_44444444]; 5308 assert(C.array == correct); 5309 } 5310 5311 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. 
5312 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @trusted 5313 { 5314 // PERF DMD D_SIMD 5315 static if (GDC_with_SSE2) 5316 { 5317 return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b); 5318 } 5319 else static if (DMD_with_32bit_asm) 5320 { 5321 asm pure nothrow @nogc @trusted 5322 { 5323 movdqu XMM0, a; 5324 movdqu XMM1, b; 5325 punpcklbw XMM0, XMM1; 5326 movdqu a, XMM0; 5327 } 5328 return a; 5329 } 5330 else version(LDC) 5331 { 5332 return cast(__m128i) shufflevectorLDC!(byte16, 0, 16, 1, 17, 2, 18, 3, 19, 5333 4, 20, 5, 21, 6, 22, 7, 23) 5334 (cast(byte16)a, cast(byte16)b); 5335 } 5336 else 5337 { 5338 byte16 r = void; 5339 byte16 ba = cast(byte16)a; 5340 byte16 bb = cast(byte16)b; 5341 r.ptr[0] = ba.array[0]; 5342 r.ptr[1] = bb.array[0]; 5343 r.ptr[2] = ba.array[1]; 5344 r.ptr[3] = bb.array[1]; 5345 r.ptr[4] = ba.array[2]; 5346 r.ptr[5] = bb.array[2]; 5347 r.ptr[6] = ba.array[3]; 5348 r.ptr[7] = bb.array[3]; 5349 r.ptr[8] = ba.array[4]; 5350 r.ptr[9] = bb.array[4]; 5351 r.ptr[10] = ba.array[5]; 5352 r.ptr[11] = bb.array[5]; 5353 r.ptr[12] = ba.array[6]; 5354 r.ptr[13] = bb.array[6]; 5355 r.ptr[14] = ba.array[7]; 5356 r.ptr[15] = bb.array[7]; 5357 return cast(__m128i)r; 5358 } 5359 } 5360 unittest 5361 { 5362 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 5363 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 5364 byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B); 5365 byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]; 5366 assert(C.array == correct); 5367 } 5368 5369 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`. 5370 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @trusted 5371 { 5372 // PERF DMD D_SIMD 5373 static if (GDC_with_SSE2) 5374 { 5375 return __builtin_ia32_unpcklpd(a, b); 5376 } 5377 else version(LDC) 5378 { 5379 return shufflevectorLDC!(__m128d, 0, 2)(a, b); 5380 } 5381 else 5382 { 5383 double2 r = void; 5384 r.ptr[0] = a.array[0]; 5385 r.ptr[1] = b.array[0]; 5386 return r; 5387 } 5388 } 5389 unittest 5390 { 5391 __m128d A = _mm_setr_pd(4.0, 6.0); 5392 __m128d B = _mm_setr_pd(7.0, 9.0); 5393 __m128d C = _mm_unpacklo_pd(A, B); 5394 double[2] correct = [4.0, 7.0]; 5395 assert(C.array == correct); 5396 } 5397 5398 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 5399 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe 5400 { 5401 return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b); 5402 } 5403 unittest 5404 { 5405 __m128d A = _mm_setr_pd(-4.0, 6.0); 5406 __m128d B = _mm_setr_pd(4.0, -6.0); 5407 long2 R = cast(long2) _mm_xor_pd(A, B); 5408 long[2] correct = [long.min, long.min]; 5409 assert(R.array == correct); 5410 } 5411 5412 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`. 
5413 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe 5414 { 5415 return a ^ b; 5416 } 5417 unittest 5418 { 5419 __m128i A = _mm_setr_epi64(975394, 619809709); 5420 __m128i B = _mm_setr_epi64(-920275025, -6); 5421 long2 R = cast(long2) _mm_xor_si128(A, B); 5422 long[2] correct = [975394 ^ (-920275025L), 619809709L ^ -6]; 5423 assert(R.array == correct); 5424 } 5425 5426 unittest 5427 { 5428 float distance(float[4] a, float[4] b) nothrow @nogc 5429 { 5430 __m128 va = _mm_loadu_ps(a.ptr); 5431 __m128 vb = _mm_loadu_ps(b.ptr); 5432 __m128 diffSquared = _mm_sub_ps(va, vb); 5433 diffSquared = _mm_mul_ps(diffSquared, diffSquared); 5434 __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared)); 5435 sum = _mm_add_ps(sum, _mm_srli_ps!4(sum)); 5436 return _mm_cvtss_f32(_mm_sqrt_ss(sum)); 5437 } 5438 assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2); 5439 }
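// Illustrative example (not an Intel intrinsic): a common SSE2 idiom built from the unpack
// intrinsics above, zero-extending 8-bit lanes to 32-bit lanes by interleaving with zero.
unittest
{
    __m128i zero  = _mm_setzero_si128();
    __m128i bytes = _mm_setr_epi8(1, 2, 3, cast(byte)250, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i w16 = _mm_unpacklo_epi8(bytes, zero);   // 8-bit -> 16-bit, zero-extended
    __m128i w32 = _mm_unpacklo_epi16(w16, zero);    // 16-bit -> 32-bit, zero-extended
    int[4] correct = [1, 2, 3, 250];
    assert(w32.array == correct);
}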