/**
* SSE2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1
import inteli.mmx;
import inteli.internals;

nothrow @nogc:


// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

/// Add packed 16-bit integers in `a` and `b`.
/// Additions wrap around on overflow (two's complement), like the PADDW instruction.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(short8)a + cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 R = cast(short8) _mm_add_epi16(A, A);
    short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
/// Additions wrap around on overflow, like the PADDD instruction.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(int4)a + cast(int4)b);
}
unittest
{
    __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
    int4 R = _mm_add_epi32(A, A);
    int[4] correct = [ -14, -2, 0, 18 ];
    assert(R.array == correct);
}

/// Add packed 64-bit integers in `a` and `b`.
/// Additions wrap around on overflow, like the PADDQ instruction.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(long2)a + cast(long2)b);
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 R = cast(long2) _mm_add_epi64(A, A);
    long[2] correct = [ -2, 0 ];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
/// Additions wrap around on overflow, like the PADDB instruction.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(byte16)a + cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 R = cast(byte16) _mm_add_epi8(A, A);
    // 77+77 and 78+78 wrap around into negative values.
    byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(R.array == correct);
}

/// Add the lower double-precision (64-bit) floating-point element
/// in `a` and `b`, store the result in the lower element of dst,
/// and copy the upper element from `a` to the upper element of destination.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128d) __simd(XMM.ADDSD, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}

/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    // PERF DMD
    pragma(inline, true);
    return a + b;
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
/// Results are clamped to [-32768, 32767] instead of wrapping.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDSW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        return cast(__m128i) inteli_llvm_adds!short8(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Scalar fallback: widen to int, then saturate each lane.
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_setr_epi16( 7, 6, 5, -32768, 3, 3, 32767, 0),
                                             _mm_setr_epi16( 7, 6, 5, -30000, 3, 1, 1, -10));
    static immutable short[8] correctResult = [14, 12, 10, -32768, 6, 4, 32767, -10];
    assert(res.array == correctResult);
}

/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
/// Results are clamped to [-128, 127] instead of wrapping.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDSB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        return cast(__m128i) inteli_llvm_adds!byte16(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Scalar fallback: widen to int, then saturate each lane.
        byte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
        {
            // Use .array for lane access, consistent with the other saturated-add
            // fallbacks (_mm_adds_epi16/epu8/epu16); direct vector indexing is not
            // supported by all D compiler front-ends.
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 127, 9, 8, 7, 6, 5, -128, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, -4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, -128, 10, 12, 14,
                                               16, 18, 127, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}

/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
/// Results are clamped to [0, 255] instead of wrapping.
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDUSB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddusb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        return cast(__m128i) inteli_llvm_addus!byte16(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Scalar fallback: reinterpret lanes as unsigned, widen, then clamp.
        ubyte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16)
        _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                      _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14,
                                               0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
/// Results are clamped to [0, 65535] instead of wrapping.
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // Note: DMD generates a reverted paddusw vs LDC and GDC, but that doesn't change the result anyway
        return cast(__m128i) __simd(XMM.PADDUSW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddusw128(cast(short8)a, cast(short8)b);
    }
    else static if(LDC_with_saturated_intrinsics)
    {
        return cast(__m128i) inteli_llvm_addus!short8(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Scalar fallback: reinterpret lanes as unsigned, widen, then clamp.
        ushort[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    // Performed as an integer AND of the raw bit patterns.
    return cast(__m128d)( cast(long2)a & cast(long2)b );
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m128d A = _mm_set_pd(a, b);
    __m128d B = _mm_set_pd(b, a);
    long2 R = cast(long2)( _mm_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return a & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_and_si128(A, B);
    int[4] correct = [6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of packed double-precision (64-bit)
/// floating-point elements in `a` and then AND with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128d) __simd(XMM.ANDNPD, a, b);
    }
    else
    {
        // (~a) & b, computed on the raw bit patterns.
        return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
    }
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
    long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
    __m128d A = _mm_setr_pd(a, b);
    __m128d B = _mm_setr_pd(b, a);
    long2 R = cast(long2)( _mm_andnot_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct2);
}

/// Compute the bitwise NOT of 128 bits (representing integer data)
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PANDN, a, b);
    }
    else
    {
        return (~a) & b;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(7, -2, 9, 54654);
    __m128i B = _mm_setr_epi32(14, 78, 111, -256);
    __m128i R = _mm_andnot_si128(A, B);
    int[4] correct = [8, 0, 102, -54784];
    assert(R.array == correct);
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
/// The average is rounded up: (a + b + 1) >> 1, like the PAVGW instruction.
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PAVGW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE2 && __VERSION__ >= 2094)
    {
        // Exists since LDC 1.18
        return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_optimizations)
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Scalar fallback: compute in 32-bit so the +1 cannot overflow.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48);
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
/// The average is rounded up: (a + b + 1) >> 1, like the PAVGB instruction.
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PAVGB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_SSE2 && __VERSION__ >= 2094)
    {
        // Exists since LDC 1.18
        return cast(__m128i) __builtin_ia32_pavgb128(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_optimizations)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Scalar fallback: compute in 16-bit so the +1 cannot overflow.
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr.ptr[i] = cast(ubyte)( (cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48);
}

/// Shift `a` left by `bytes` bytes while shifting in zeros.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Cast vector of type `__m128d` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    // All _mm_castX_Y intrinsics are free bitwise reinterpretations.
    return cast(__m128)a;
}

/// Cast vector of type `__m128d` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128i` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128i` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}

/// Invalidate and flush the cache line that contains `p`
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc @trusted
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc @trusted
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else
    {
        // Do nothing. Invalidating cacheline does
        // not affect correctness.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
/// Each lane of the result is -1 (all bits set) on equality, 0 otherwise.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(short8)a == cast(short8)b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0];
    short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
/// Each lane of the result is -1 (all bits set) on equality, 0 otherwise.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(int4)a == cast(int4)b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4 A = [-3, -2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, -1];
    int4 R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
/// Each lane of the result is -1 (all bits set) on equality, 0 otherwise.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(byte16)a == cast(byte16)b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality.
/// Ordered comparison: a NaN operand yields a zero (false) mask.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(double2)(cast(double2)a == cast(double2)b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}
unittest
{
    double2 A = _mm_setr_pd(1.0, 2.0);
    double2 B = _mm_setr_pd(0.0, 2.0);
    double2 N = _mm_setr_pd(double.nan, double.nan);
    long2 C = cast(long2) _mm_cmpeq_pd(A, B);
    long[2] correctC = [0, -1];
    assert(C.array == correctC);
    long2 D = cast(long2) _mm_cmpeq_pd(N, N);
    long[2] correctD = [0, 0];
    assert(D.array == correctD);
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // Predicate 0 = EQ (ordered).
        return cast(__m128d) __simd(XMM.CMPSD, a, b, 0);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}
unittest
{
    double2 A = _mm_setr_pd(0.0, 2.0);
    double2 B = _mm_setr_pd(1.0, 2.0);
    double2 C = _mm_setr_pd(1.0, 3.0);
    double2 D = cast(double2) _mm_cmpeq_sd(A, B);
    long2 E = cast(long2) _mm_cmpeq_sd(B, C);
    double[2] correctD = [0.0, 2.0];
    double two = 2.0;
    long[2] correctE = [-1, *cast(long*)&two];
    assert(D.array == correctD);
    assert(E.array == correctE);
}

/// Compare packed 16-bit integers elements in `a` and `b` for greater-than-or-equal.
/// #BONUS
__m128i _mm_cmpge_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(short8)a >= cast(short8)b);
    }
    else version (LDC)
    {
        // LDC ARM64: generates cmge since -O1
        return cast(__m128i) greaterOrEqualMask!short8(cast(short8)a, cast(short8)b);
    }
    else
    {
        // ge == (eq XOR gt) for integer lanes, since eq and gt are disjoint.
        return _mm_xor_si128(_mm_cmpeq_epi16(a, b), _mm_cmpgt_epi16(a, b));
    }
}
unittest
{
    short8 A = [-3, -2, -32768, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 32767, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, -1, -1, -1, -1];
    short8 R = cast(short8)(_mm_cmpge_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal.
/// Ordered comparison: a NaN operand yields a zero (false) mask.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128d)(a >= b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @trusted
{
    // Note: there is no direct "greater-or-equal" scalar compare instruction,
    // so (a >= b) is computed as (b <= a) with swapped operands. The swapped
    // forms leave `b`'s upper lane in the result, so the lower lane is spliced
    // back into `a` to honor the contract (upper element copied from `a`).
    static if (DMD_with_DSIMD)
    {
        // Predicate 2 = LE (ordered), with operands swapped.
        __m128d c = cast(__m128d) __simd(XMM.CMPSD, b, a, 2);
        a.ptr[0] = c.array[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        __m128d c = __builtin_ia32_cmplesd(b, a);
        a.ptr[0] = c.array[0];
        return a;
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 0.0);
    __m128d B = _mm_setr_pd(double.nan, 0.0);
    __m128d C = _mm_setr_pd(2.0, 0.0);
    assert( (cast(long2)_mm_cmpge_sd(A, A)).array[0] == -1);
    assert( (cast(long2)_mm_cmpge_sd(A, B)).array[0] == 0);
    assert( (cast(long2)_mm_cmpge_sd(A, C)).array[0] == 0);
    assert( (cast(long2)_mm_cmpge_sd(B, A)).array[0] == 0);
    assert( (cast(long2)_mm_cmpge_sd(B, B)).array[0] == 0);
    assert( (cast(long2)_mm_cmpge_sd(B, C)).array[0] == 0);
    assert( (cast(long2)_mm_cmpge_sd(C, A)).array[0] == -1);
    assert( (cast(long2)_mm_cmpge_sd(C, B)).array[0] == 0);
    assert( (cast(long2)_mm_cmpge_sd(C, C)).array[0] == -1);

    // The upper element of the result must come from `a`, not `b`.
    __m128d D = _mm_setr_pd(2.0, 42.0);
    assert( _mm_cmpge_sd(D, A).array[1] == 42.0);
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(short8)a > cast(short8)b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1];
    short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(int4)a > cast(int4)b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b));
    }
}
unittest
{
    int4 A = [-3, 2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, 0];
    int4 R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(byte16)a > cast(byte16)b);
    }
    else
    {
        // Note: __builtin_ia32_pcmpgtb128 is buggy on some old GDC, do not use
        return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct = [0, 0,-1, 0, -1, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than.
/// Ordered comparison: a NaN operand yields a zero (false) mask.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128d)(a > b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @trusted
{
    // Note: there is no direct "greater-than" scalar compare instruction,
    // so (a > b) is computed as (b < a) with swapped operands. The swapped
    // forms leave `b`'s upper lane in the result, so the lower lane is spliced
    // back into `a` to honor the contract (upper element copied from `a`).
    static if (DMD_with_DSIMD)
    {
        // Predicate 1 = LT (ordered), with operands swapped.
        __m128d c = cast(__m128d) __simd(XMM.CMPSD, b, a, 1);
        a.ptr[0] = c.array[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        __m128d c = __builtin_ia32_cmpltsd(b, a);
        a.ptr[0] = c.array[0];
        return a;
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 0.0);
    __m128d B = _mm_setr_pd(double.nan, 0.0);
    __m128d C = _mm_setr_pd(2.0, 0.0);
    assert( (cast(long2)_mm_cmpgt_sd(A, A)).array[0] == 0);
    assert( (cast(long2)_mm_cmpgt_sd(A, B)).array[0] == 0);
    assert( (cast(long2)_mm_cmpgt_sd(A, C)).array[0] == 0);
    assert( (cast(long2)_mm_cmpgt_sd(B, A)).array[0] == 0);
    assert( (cast(long2)_mm_cmpgt_sd(B, B)).array[0] == 0);
    assert( (cast(long2)_mm_cmpgt_sd(B, C)).array[0] == 0);
    assert( (cast(long2)_mm_cmpgt_sd(C, A)).array[0] == -1);
    assert( (cast(long2)_mm_cmpgt_sd(C, B)).array[0] == 0);
    assert( (cast(long2)_mm_cmpgt_sd(C, C)).array[0] == 0);

    // The upper element of the result must come from `a`, not `b`.
    __m128d D = _mm_setr_pd(2.0, 42.0);
    assert( _mm_cmpgt_sd(D, A).array[1] == 42.0);
}


/// Compare packed 16-bit integers elements in `a` and `b` for less-than-or-equal.
/// #BONUS
__m128i _mm_cmple_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128i)(cast(short8)a <= cast(short8)b);
    }
    else version (LDC)
    {
        // LDC ARM64: generates cmge since -O1
        return cast(__m128i) greaterOrEqualMask!short8(cast(short8)b, cast(short8)a);
    }
    else
    {
        // (a <= b) == (b >= a) == (b == a) XOR (b > a)
        return _mm_xor_si128(_mm_cmpeq_epi16(b, a), _mm_cmpgt_epi16(b, a));
    }
}
unittest
{
    short8 A = [-3, -2, -32768, 1, 0, 1, 2, 3];
    short8 B = [ 4, 3, 32767, 0, 0, -1, -2, -3];
    short[8] E = [-1, -1, -1, 0, -1, 0, 0, 0];
    short8 R = cast(short8)(_mm_cmple_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal.
/// Ordered comparison: a NaN operand yields a zero (false) mask.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128d)(a <= b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // Predicate 2 = LE (ordered).
        return cast(__m128d) __simd(XMM.CMPSD, a, b, 2);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    // (a < b) == (b > a), so reuse the greater-than comparison.
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed 32-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than.
/// Ordered comparison: a NaN operand yields a zero (false) mask.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (SIMD_COMPARISON_MASKS_16B)
    {
        return cast(__m128d)(a < b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // Predicate 1 = LT (ordered).
        return cast(__m128d) __simd(XMM.CMPSD, a, b, 1);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal.
/// Unordered comparison: a NaN operand yields an all-ones (true) mask.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal.
/// Unordered comparison: a NaN operand yields an all-ones (true) mask.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b);
    }
    else
    {
        // not-greater-or-equal == unordered-or-less-than.
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal, store the result in
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @trusted
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        // !(a >= b) is "unordered or less-than", i.e. CMPNLESD with swapped
        // operands: !(b <= a). This matches the generic path (NaN => true),
        // unlike an ordered cmpltsd which would be false on NaN.
        // The swapped form leaves `b`'s upper lane in the result, so splice
        // the lower lane back into `a`.
        __m128d c = __builtin_ia32_cmpnlesd(b, a);
        a.ptr[0] = c.array[0];
        return a;
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 42.0);
    __m128d B = _mm_setr_pd(2.0, 0.0);
    __m128d N = _mm_setr_pd(double.nan, 0.0);
    assert( (cast(long2)_mm_cmpnge_sd(A, B)).array[0] == -1); // 1 >= 2 is false
    assert( (cast(long2)_mm_cmpnge_sd(B, A)).array[0] == 0);  // 2 >= 1 is true
    assert( (cast(long2)_mm_cmpnge_sd(A, N)).array[0] == -1); // NaN => not-ge
    // The upper element of the result must come from `a`, not `b`.
    assert( _mm_cmpnge_sd(A, B).array[1] == 42.0);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than.
/// Unordered comparison: a NaN operand yields an all-ones (true) mask.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        // not-greater-than == unordered-or-less-or-equal.
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @trusted
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        // !(a > b) is "unordered or less-or-equal", i.e. CMPNLTSD with
        // swapped operands: !(b < a). Matches the generic path (NaN => true).
        // Splice the lower lane back into `a` to keep `a`'s upper element.
        __m128d c = __builtin_ia32_cmpnltsd(b, a);
        a.ptr[0] = c.array[0];
        return a;
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 42.0);
    __m128d B = _mm_setr_pd(1.0, 0.0);
    __m128d C = _mm_setr_pd(0.0, 0.0);
    __m128d N = _mm_setr_pd(double.nan, 0.0);
    assert( (cast(long2)_mm_cmpngt_sd(A, B)).array[0] == -1); // 1 > 1 is false
    assert( (cast(long2)_mm_cmpngt_sd(B, C)).array[0] == 0);  // 1 > 0 is true
    assert( (cast(long2)_mm_cmpngt_sd(A, N)).array[0] == -1); // NaN => not-gt
    // The upper element of the result must come from `a`, not `b`.
    assert( _mm_cmpngt_sd(A, B).array[1] == 42.0);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        // ugt = unordered or greater-than, i.e. !(a <= b): true when either operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        // uge = unordered or greater-or-equal, i.e. !(a < b): true when either operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        // ord: true only when neither operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN, store the result in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        // uno: true when at least one operand is NaN.
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN, store the result in the lower
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    // Note: For some of the _mm_comixx_sx intrinsics, NaN semantics of the intrinsic are not the same as the
    // comisd instruction, it returns false in case of unordered instead.
    //
    // Actually C++ compilers disagree over the meaning of that instruction.
    // GCC will manage NaNs like the comisd instruction (return true if unordered),
    // but ICC, clang and MSVC will deal with NaN like the Intel Intrinsics Guide says.
    // We choose to do like the most numerous. It seems GCC is buggy with NaNs.
    return a.array[0] == b.array[0];
}
unittest
{
    assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); // IEEE 754: +0.0 and -0.0 compare equal
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than-or-equal, and return the boolean
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    // Ordered comparison: returns 0 when either operand is NaN.
    return a.array[0] >= b.array[0];
}
unittest
{
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    // Ordered comparison: returns 0 when either operand is NaN.
    return a.array[0] > b.array[0];
}
unittest
{
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than-or-equal.
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    // Ordered comparison: returns 0 when either operand is NaN.
    return a.array[0] <= b.array[0];
}
unittest
{
    assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    // Ordered comparison: returns 0 when either operand is NaN.
    return a.array[0] < b.array[0];
}
unittest
{
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    // Note: unlike the ordered comi* comparisons above, != yields 1 when
    // either operand is NaN (unordered not-equal), per the unittest below.
    return a.array[0] != b.array[0];
}
unittest
{
    assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        // Only the low two 32-bit lanes of `a` are converted.
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128)__simd(XMM.CVTDQ2PS, cast(void16) a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else static if (LDC_with_optimizations)
    {
        // See #86 for why we had to resort to LLVM IR.
        // Plain code below was leading to catastrophic behaviour.
        // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x float>
            ret <4 x float> %r`;
        return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
    }
    else static if (LDC_with_x86_asm)
    {
        __m128 r;
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            cvtdq2ps XMM0, XMM0;
            movdqu r, XMM0;
        }
        return r;
    }
    else
    {
        __m128 res; // PERF =void;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed 32-bit integers.
/// Note: not `pure`, since the result depends on the current rounding
/// mode (MXCSR on x86, FPCR on AArch64).
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    // PERF ARM32
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        long2 i;
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
            case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
            case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
        }
        int4 zero = 0;
        return cast(__m128i) shufflevectorLDC!(int4, 0, 2, 4, 6)(cast(int4)i, zero); // PERF: this slow down build for nothing, test without shufflevector
    }
    else
    {
        // PERF ARM32
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    {
        // The two upper lanes are zeroed, matching the cvtpd2ps instruction.
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers
/// Note: not `pure`, since rounding follows the current rounding mode
/// (MXCSR on x86, FPCR on AArch64).
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_cvtps2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    // GDC bug #98607
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide optimization barrier for rounding mode.
    // Worked around with different literals. This bug will likely only manifest in unittest.
    // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    static if (LDC_with_optimizations)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        // Only the low two float lanes are widened.
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
/// Note: not `pure`, since rounding follows the current MXCSR rounding mode.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtsd_si64 (__m128d a) @trusted
{
    static if (LDC_with_SSE2)
    {
        version (X86_64)
        {
            return __builtin_ia32_cvtsd2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
    else
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///

/// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit)
/// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b);
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a.ptr[0] = b.array[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}

/// Get the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la.array[0];
}
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64; ///

/// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 r = [0, 0, 0, 0];
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}

/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}

deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///

/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit)
/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper
/// element of result.
double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
/// Put zeroes in the upper elements of result.
__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else
    {
        // Truncation (round toward zero) is independent of MXCSR, so a plain
        // cast implements it and the function can be `pure`.
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r; // PERF =void;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: generates fcvtze since LDC 1.8 -O2
    __m128i r; // PERF = void;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a) pure @safe
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // Truncation doesn't depend on the MXCSR rounding mode, hence `pure @safe`
    // (consistent with _mm_cvttss_si64), unlike the rounding _mm_cvtsd_si32.
    return cast(int)a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a) pure @safe
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resort to FPU
    return cast(long)a.array[0];
}

deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///

/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a / b;
}

/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`, store
/// that in the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_divsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        a.ptr[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    // `index & 7`: only the low 3 bits of the index are used, as the unittest
    // below demonstrates with `5 + 8`.
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
    assert(_mm_extract_epi16(A, 5 + 8) == 5);
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

/// Perform a serializing operation on all load-from-memory instructions that were issued prior
/// to this instruction. Guarantees that every load instruction that precedes, in program order,
/// is globally visible before any load instruction which follows the fence in program order.
void _mm_lfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else __warn_noop();
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_lfence();
    }
    else static if (LDC_with_ARM64)
    {
        __builtin_arm_dmb(9);  // dmb ishld
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @trusted
        {
            lfence;
        }
    }
    else version(LDC)
    {
        // When the architecture is unknown, generate a full memory barrier,
        // as the semantics of lfence do not really match those of atomics.
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_lfence();
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    pragma(inline, true);
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}
unittest
{
    align(16) double[2] S = [-5.0, 7.0];
    __m128d R = _mm_load_pd(S.ptr);
    assert(R.array == S);
}

/// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double m = *mem_addr;
    __m128d r; // PERF =void;
    r.ptr[0] = m;
    r.ptr[1] = m;
    return r;
}
unittest
{
    double what = 4;
    __m128d R = _mm_load_pd1(&what);
    double[2] correct = [4.0, 4];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper
/// element. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Load 128-bits of integer data from memory into dst.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @safe
{
    pragma(inline, true);
    return *mem_addr;
}
unittest
{
    align(16) int[4] correct = [-1, 2, 3, 4];
    int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
    assert(A.array == correct);
}

alias _mm_load1_pd = _mm_load_pd1; ///

/// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the
/// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    pragma(inline, true);
    a.ptr[1] = *mem_addr;
    return a;
}
unittest
{
    double A = 7.0;
    __m128d B = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadh_pd(B, &A);
    double[2] correct = [ 4.0, 7.0 ];
    assert(R.array == correct);
}

/// Load 64-bit integer from memory into the first element of result. Zero out the other.
/// Note: strange signature since the memory doesn't have to be aligned, and should point to addressable 64-bit, not 128-bit.
/// You may use `_mm_loadu_si64` instead.
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
    }
    else
    {
        auto pLong = cast(const(long)*)mem_addr;
        long2 r = [0, 0];
        r.ptr[0] = *pLong;
        return cast(__m128i)(r);
    }
}
unittest
{
    long A = 0x7878787870707070;
    long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
    long[2] correct = [0x7878787870707070, 0];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the
/// upper element from `a` to result.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    a.ptr[0] = *mem_addr;
    return a;
}
unittest
{
    double A = 7.0;
    __m128d B = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadl_pd(B, &A);
    double[2] correct = [ 7.0, -5.0 ];
    assert(R.array == correct);
}

/// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    __m128d a = *cast(__m128d*)(mem_addr);
    __m128d r; // PERF =void;
    r.ptr[0] = a.array[1];
    r.ptr[1] = a.array[0];
    return r;
}
unittest
{
    align(16) double[2] A = [56.0, -74.0];
    __m128d R = _mm_loadr_pd(A.ptr);
    double[2] correct = [-74.0, 56.0];
    assert(R.array == correct);
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loadupd(mem_addr);
    }
    else static if (LDC_with_optimizations)
    {
        return loadUnaligned!(double2)(mem_addr);
    }
    else version(DigitalMars)
    {
        // Apparently inside __simd you can use aligned dereferences without fear.
        // That was issue 23048 on dlang's Bugzilla.
        static if (DMD_with_DSIMD)
        {
            return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr);
        }
        else static if (SSESizedVectorsAreEmulated)
        {
            // Since this vector is emulated, it doesn't have alignment constraints
            // and as such we can just cast it.
            return *cast(__m128d*)(mem_addr);
        }
        else
        {
            __m128d result;
            result.ptr[0] = mem_addr[0];
            result.ptr[1] = mem_addr[1];
            return result;
        }
    }
    else
    {
        __m128d result;
        result.ptr[0] = mem_addr[0];
        result.ptr[1] = mem_addr[1];
        return result;
    }
}
unittest
{
    double[2] A = [56.0, -75.0];
    __m128d R = _mm_loadu_pd(A.ptr);
    double[2] correct = [56.0, -75.0];
    assert(R.array == correct);
}

/// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    // PERF DMD
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
    }
    else static if (LDC_with_optimizations)
    {
        return loadUnaligned!(__m128i)(cast(int*)mem_addr);
    }
    else
    {
        const(int)* p = cast(const(int)*)mem_addr;
        __m128i r = void;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        return r;
    }
}
unittest
{
    align(16) int[4] correct = [-1, 2, -3, 4];
    int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
    assert(A.array == correct);
}

/// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise.
__m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted // TODO: should be @system actually
{
    static if (DMD_with_DSIMD)
    {
        int r = *cast(short*)(mem_addr);
        return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r);
    }
    else version(DigitalMars)
    {
        // Workaround for issue: https://issues.dlang.org/show_bug.cgi?id=21672
        // DMD cannot handle the below code...
        align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0];
        r[0] = *cast(short*)(mem_addr);
        return *cast(int4*)(r.ptr);
    }
    else
    {
        short r = *cast(short*)(mem_addr);
        short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
        result.ptr[0] = r;
        return cast(__m128i)result;
    }
}
unittest
{
    short r = 13;
    short8 A = cast(short8) _mm_loadu_si16(&r);
    short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
    assert(A.array == correct);
}

/// Load unaligned 32-bit integer from memory into the first element of result.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted // TODO: should be @system actually
{
    pragma(inline, true);
    int r = *cast(int*)(mem_addr);
    int4 result = [0, 0, 0, 0];
    result.ptr[0] = r;
    return result;
}
unittest
{
    int r = 42;
    __m128i A = _mm_loadu_si32(&r);
    int[4] correct = [42, 0, 0, 0];
    assert(A.array == correct);
}

/// Load unaligned 64-bit integer from memory into the first element of result.
/// Upper 64-bit is zeroed.
__m128i _mm_loadu_si64 (const(void)* mem_addr) pure @system
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
    }
    else
    {
        auto pLong = cast(const(long)*)mem_addr;
        long2 r = [0, 0];
        r.ptr[0] = *pLong;
        return cast(__m128i)r;
    }
}
unittest
{
    long r = 446446446446;
    long2 A = cast(long2) _mm_loadu_si64(&r);
    long[2] correct = [446446446446, 0];
    assert(A.array == correct);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
/// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
/// and pack the results in destination.
__m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
{
    // GDC and LDC branches were byte-identical; merged under GDC_or_LDC_with_SSE2
    // for consistency with the other intrinsics in this module that use it.
    static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_optimizations)
    {
        // 5 inst with arm64 + LDC 1.32 + -O1
        enum ir = `
            %ia = sext <8 x i16> %0 to <8 x i32>
            %ib = sext <8 x i16> %1 to <8 x i32>
            %p = mul <8 x i32> %ia, %ib
            %p_even = shufflevector <8 x i32> %p, <8 x i32> undef, <4 x i32> <i32 0, i32 2,i32 4, i32 6>
            %p_odd = shufflevector <8 x i32> %p, <8 x i32> undef, <4 x i32> <i32 1, i32 3,i32 5, i32 7>
            %p_sum = add <4 x i32> %p_even, %p_odd
            ret <4 x i32> %p_sum`;
        return cast(__m128i) LDCInlineIR!(ir, int4, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        int4 r;
        foreach(i; 0..4)
        {
            // int wrap-around on 0x8000*0x8000 + 0x8000*0x8000 matches PMADDWD
            // (the only overflowing case — see unittest).
            r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return r;
    }
}
unittest
{
    short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
    int[4] correct = [1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Conditionally store 8-bit integer elements from `a` into memory using `mask`
/// (elements are not stored when the highest bit is not set in the corresponding element)
/// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
/// boundary.
void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
    }
    else static if (LDC_with_ARM64)
    {
        // PERF: catastrophic on ARM32
        // NOTE(review): unlike the x86 MASKMOVDQU instruction, this path reads and
        // rewrites all 16 bytes at mem_addr (full load/blend/store), even bytes whose
        // mask bit is clear — confirm no caller relies on unselected bytes never
        // being touched in memory (e.g. concurrent writers, memory-mapped I/O).
        byte16 bmask = cast(byte16)mask;
        byte16 shift = 7;
        bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
        mask = cast(__m128i) bmask;
        __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
        dest = (a & mask) | (dest & ~mask);
        storeUnaligned!__m128i(dest, cast(int*)mem_addr);
    }
    else
    {
        // Scalar fallback: store a byte only when the MSB of its mask byte is set.
        byte16 b = cast(byte16)a;
        byte16 m = cast(byte16)mask;
        byte* dest = cast(byte*)(mem_addr);
        foreach(j; 0..16)
        {
            if (m.array[j] & 128)
            {
                dest[j] = b.array[j];
            }
        }
    }
}
unittest
{
    ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
    __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
    assert(dest == correct);
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pmaxsw since LDC 1.0 -O1
        // ARM: smax.8h since LDC 1.5 -01
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            short8 greater = sa > sb;
        else
            short8 greater = greaterMask!short8(sa, sb);
        // Branchless select: take `sa` where the mask is all-ones, `sb` elsewhere.
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // xor-blend: b ^ ((a ^ b) & mask) yields a where mask is set, b otherwise.
        __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lowerShorts);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57),
                                          _mm_setr_epi16(-4,-8, 9, 7, 0,-32768, 0, 0));
    short[8] correct = [32767, 1, 9, 7, 9, 7, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values.
__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
{
    // PERF DMD
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmaxub128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pmaxub since LDC 1.0.0 -O1
        // ARM64: umax.16b since LDC 1.5.0 -O1
        // PERF: catastrophic on ARM32
        ubyte16 sa = cast(ubyte16)a;
        ubyte16 sb = cast(ubyte16)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            ubyte16 greater = (cast(ubyte16)a > cast(ubyte16)b);
        else
            ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // PERF: use algorithm from _mm_max_epu16
        // Unsigned compare via signed compare: biasing both sides by -128 maps the
        // unsigned order onto the signed order.
        __m128i value128 = _mm_set1_epi8(-128);
        __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
        // xor-blend: b ^ ((a ^ b) & mask) yields a where mask is set, b otherwise.
        __m128i aTob = a ^ b; // a ^ (a ^ b) == b
        __m128i mask = aTob & higher;
        return b ^ mask;
    }
}
unittest
{
    byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
                                         _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
    byte[16] correct =                  [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return
/// packed maximum values.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxpd(a, b);
    }
    else
    {
        // x86: Generates maxpd starting with LDC 1.9 -O2
        // `(x > y) ? x : y` picks the second operand when the compare is false
        // (NaN or equal) — the same operand-ordering behavior as the MAXPD
        // instruction. NOTE(review): confirm NaN propagation matches for all cases.
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 8.0);
}

/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxsd(a, b);
    }
    else
    {
        __m128d r = a;
        // Generates maxsd starting with LDC 1.3
        r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 1.0);
}

/// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to
/// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction
/// is globally visible before any memory instruction which follows the fence in program order.
void _mm_mfence() @trusted // not pure!
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else __warn_noop();
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_mfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @trusted
        {
            mfence;
        }
    }
    else version(LDC)
    {
        // Note: will generate the DMB ish instruction on ARM
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_mfence();
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -01
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            short8 greater = sa > sb;
        else
            short8 greater = greaterMask!short8(sa, sb);
        // Same blend as _mm_max_epi16, with the select operands swapped.
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        // xor-blend: b ^ ((a ^ b) & mask) yields a where mask is set, b otherwise.
        __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lowerShorts);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-32768),
                                          _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0));
    short[8] correct =                   [-4,-8, -4, -8, 0,-57, 0, -32768];
    assert(R.array == correct);
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pminub128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pminub since LDC 1.0.0 -O1
        // ARM: umin.16b since LDC 1.5.0 -O1
        // PERF: catastrophic on ARM32
        ubyte16 sa = cast(ubyte16)a;
        ubyte16 sb = cast(ubyte16)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            ubyte16 greater = (cast(ubyte16)a > cast(ubyte16)b);
        else
            ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        // PERF: use the algorithm from _mm_max_epu16
        // Unsigned compare via signed compare: bias both sides by -128.
        __m128i value128 = _mm_set1_epi8(-128);
        __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
        // xor-blend: b ^ ((a ^ b) & mask) yields a where mask is set, b otherwise.
        __m128i aTob = a ^ b; // a ^ (a ^ b) == b
        __m128i mask = aTob & lower;
        return b ^ mask;
    }
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
                                         _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
    byte[16] correct =                  [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0];
    assert(R.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minpd(a, b);
    }
    else
    {
        // Generates minpd starting with LDC 1.9
        // `(x < y) ? x : y` picks the second operand when the compare is false
        // (NaN or equal) — same operand-ordering behavior as the MINPD instruction.
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 1.0);
    __m128d M = _mm_min_pd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 1.0);
}

/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minsd(a, b);
    }
    else
    {
        // Generates minsd starting with LDC 1.3
        // Consistency fix: write through .ptr under @trusted, exactly like the twin
        // _mm_max_sd (the previous `.array[0] =` form under @safe was the only
        // outlier among the min/max scalar intrinsics).
        __m128d r = a;
        r.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_min_sd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 3.0);
}

/// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
__m128i _mm_move_epi64 (__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // slightly better with GDC -O0
        return cast(__m128i) __builtin_ia32_movq128(cast(long2)a);
    }
    else
    {
        long2 result = [ 0, 0 ];
        long2 la = cast(long2) a;
        result.ptr[0] = la.array[0];
        return cast(__m128i)(result);
    }
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] correct = [13, 0];
    assert(B.array == correct);
}

/// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy
/// the upper element from `a` to the upper element of dst.
__m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movsd(a, b);
    }
    else
    {
        // Overwrite b's upper lane with a's; lower lane of b is kept.
        b.ptr[1] = a.array[1];
        return b;
    }
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(C.array == correct);
}

/// Create mask from the most significant bit of each 8-bit element in `a`.
int _mm_movemask_epi8 (__m128i a) pure @trusted
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pmovmskb128(cast(byte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions lead to unfound intrinsics in LLVM and that took a long time.
        // SO there might be something a bit faster, but this one is reasonable and branchless.
        // Per-lane shift amounts 7..0: after masking the MSB (0x80), lane i's bit
        // lands at bit position i; the pairwise-add cascades then OR-accumulate
        // (sum of disjoint bits) all 8 bits into lane 0 of each half.
        byte8 mask_shift;
        mask_shift.ptr[0] = 7;
        mask_shift.ptr[1] = 6;
        mask_shift.ptr[2] = 5;
        mask_shift.ptr[3] = 4;
        mask_shift.ptr[4] = 3;
        mask_shift.ptr[5] = 2;
        mask_shift.ptr[6] = 1;
        mask_shift.ptr[7] = 0;
        byte8 mask_and = byte8(-128);
        byte8 lo = vget_low_u8(cast(byte16)a);
        byte8 hi = vget_high_u8(cast(byte16)a);
        lo = vand_u8(lo, mask_and);
        lo = vshr_u8(lo, mask_shift);
        hi = vand_u8(hi, mask_and);
        hi = vshr_u8(hi, mask_shift);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
    }
    else
    {
        // Scalar fallback: bit i of the result is the sign bit of byte i.
        byte16 ai = cast(byte16)a;
        int r = 0;
        foreach(bit; 0..16)
        {
            if (ai.array[bit] < 0) r += (1 << bit);
        }
        return r;
    }
}
unittest
{
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
}

/// Create mask from the most significant bit of each 16-bit element in `a`. #BONUS
int _mm_movemask_epi16 (__m128i a) pure @trusted
{
    // Saturating pack preserves each word's sign as its byte's sign, so the
    // 8-bit movemask of the packed low half gives the 16-bit mask.
    return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128()));
}
unittest
{
    assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8)));
}

/// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit)
/// floating-point element in `v`.
int _mm_movemask_pd(__m128d v) pure @safe
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_or_LDC_with_SSE2)
    {
        return __builtin_ia32_movmskpd(v);
    }
    else
    {
        // The sign bit of a double is the MSB of its 64-bit pattern, hence the
        // long2 reinterpretation and the < 0 tests.
        long2 lv = cast(long2)v;
        int r = 0;
        if (lv.array[0] < 0) r += 1;
        if (lv.array[1] < 0) r += 2;
        return r;
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
    assert(_mm_movemask_pd(A) == 2);
}

/// Copy the lower 64-bit integer in `v`.
__m64 _mm_movepi64_pi64 (__m128i v) pure @safe
{
    long2 lv = cast(long2)v;
    return long1(lv.array[0]);
}
unittest
{
    __m128i A = _mm_set_epi64x(-1, -2);
    __m64 R = _mm_movepi64_pi64(A);
    assert(R.array[0] == -2);
}

/// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
__m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
{
    long2 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = 0;
    return cast(__m128i)r;
}
unittest // added: this intrinsic previously had no test coverage
{
    __m64 A = long1(-1);
    long2 R = cast(long2) _mm_movpi64_epi64(A);
    long[2] correct = [-1, 0];
    assert(R.array == correct);
}

/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`,
/// and store the unsigned 64-bit results.
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmuludq128 (a, b);
    }
    else
    {
        // la/lb hold the zero-extended even lanes (elements 0 and 2) of a and b.
        // They stay visible after the version/static-if blocks below because
        // conditional-compilation blocks do not introduce a new scope in D.
        version(LDC)
        {
            static if (__VERSION__ >= 2088)
            {
                // Need LLVM9 for proper optimization
                long2 la, lb;
                la.ptr[0] = cast(uint)a.array[0];
                la.ptr[1] = cast(uint)a.array[2];
                lb.ptr[0] = cast(uint)b.array[0];
                lb.ptr[1] = cast(uint)b.array[2];
            }
            else
            {
                __m128i zero;
                zero = 0;
                long2 la = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(a, zero);
                long2 lb = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(b, zero);
            }
        }
        else
        {
            long2 la, lb;
            la.ptr[0] = cast(uint)a.array[0];
            la.ptr[1] = cast(uint)a.array[2];
            lb.ptr[0] = cast(uint)b.array[0];
            lb.ptr[1] = cast(uint)b.array[2];
        }

        version(DigitalMars)
        {
            // DMD has no long2 mul
            la.ptr[0] *= lb.array[0];
            la.ptr[1] *= lb.array[1];
            return cast(__m128i)(la);
        }
        else
        {
            static if (__VERSION__ >= 2076)
            {
                return cast(__m128i)(la * lb);
            }
            else
            {
                // long2 mul not supported before LDC 1.5
                la.ptr[0] *= lb.array[0];
                la.ptr[1] *= lb.array[1];
                return cast(__m128i)(la);
            }
        }
    }
}
unittest
{
    __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
    __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL);
    assert(LC.array[1] == 12723420444339690338uL);
}

/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a * b;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_pd(a, a);
    assert(a.array == [4.0, 2.25]);
}

/// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower
/// element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] * b.array[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_mulsd(a, b);
    }
    else
    {
        a.ptr[0] *= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_sd(a, a);
    assert(a.array == [4.0, 1.5]);
}

/// Multiply the low unsigned 32-bit integers from `a` and `b`,
/// and get an unsigned 64-bit result.
__m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
{
    // Widen to 128-bit, reuse _mm_mul_epu32 on the even lanes, narrow back.
    return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
    __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
    __m64 C = _mm_mul_su32(A, B);
    assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
}

/// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the
/// high 16 bits of the intermediate integers.
__m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
{
    // GDC and LDC branches were byte-identical; merged under GDC_or_LDC_with_SSE2
    // for consistency with the other intrinsics in this module that use it.
    static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
        // PERF: it seems the simde solution has one less instruction in ARM64.
        // PERF: Catastrophic in ARM32.
        // short * short promotes to int; >> 16 then keeps the high half of the product.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
        r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
        r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
        r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
        r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
        r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
        r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
        r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epi16(A, B);
    short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}

/// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the
/// high 16 bits of the intermediate integers.
__m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
{
    // GDC and LDC branches were byte-identical; merged under GDC_or_LDC_with_SSE2
    // for consistency with the other intrinsics in this module that use it.
    static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
        // it seems the simde solution has one less instruction in ARM64
        // PERF: Catastrophic in ARM32.
        // ushort * ushort promotes to int, which can wrap for large operands
        // (e.g. 0xFFFF * 0xFFFF). That is harmless here: after the arithmetic
        // >> 16 and the cast(short) truncation, the low 16 bits are exactly the
        // high word of the 32-bit unsigned product (relies on D's defined
        // two's-complement wraparound).
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
        r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
        r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
        r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
        r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
        r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
        r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
        r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(A, B);
    short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16
/// bits of the intermediate integers.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    // Element-wise 16-bit multiply naturally keeps only the low 16 bits.
    return cast(__m128i)(cast(short8)a * cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(16384, -16, 0,      3, 4, 1, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mullo_epi16(A, B);
    short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of 128 bits in `a`. #BONUS
__m128i _mm_not_si128 (__m128i a) pure @safe
{
    return ~a;
}
unittest
{
    __m128i A = _mm_set1_epi32(-748);
    int4 notA = cast(int4) _mm_not_si128(A);
    int[4] correct = [747, 747, 747, 747];
    assert(notA.array == correct);
}

/// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    // Reinterpret as integers: bitwise OR is only defined on integer vectors.
    return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
}

/// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return a | b;
}

/// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PACKSSDW, a, b);
    }
    // GDC and LDC branches were byte-identical; merged under GDC_or_LDC_with_SSE2
    // for consistency with the other intrinsics in this module that use it.
    else static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_packssdw128(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        short4 ra = vqmovn_s32(cast(int4)a);
        short4 rb = vqmovn_s32(cast(int4)b);
        return cast(__m128i)vcombine_s16(ra, rb);
    }
    else
    {
        // PERF: catastrophic on ARM32
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
        r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
        r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
        r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
        r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
        r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
        r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
        r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packs_epi32(A, A);
    short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
    assert(R.array == correct);
}

/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PACKSSWB, a, b);
    }
    // GDC and LDC branches were byte-identical; merged under GDC_or_LDC_with_SSE2
    // for consistency with the other intrinsics in this module that use it.
    else static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02
        byte8 ra = vqmovn_s16(cast(short8)a);
        byte8 rb = vqmovn_s16(cast(short8)b);
        return cast(__m128i)vcombine_s8(ra, rb);
    }
    else
    {
        // PERF: ARM32 is missing
        byte16 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
        foreach(i; 0..8)
            r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte16 R = cast(byte16) _mm_packs_epi16(A, A);
    byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
                        127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD catastrophic
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PACKUSWB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02
        byte8 ra = vqmovun_s16(cast(short8)a);
        byte8 rb = vqmovun_s16(cast(short8)b);
        return cast(__m128i)vcombine_s8(ra, rb);
    }
    else
    {
        // Scalar fallback: clamp each signed word to [0, 255];
        // `a` fills lanes 0..7, `b` fills lanes 8..15.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        align(16) ubyte[16] result = void;
        for (int i = 0; i < 8; ++i)
        {
            short s = sa[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i] = cast(ubyte)s;

            s = sb[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i+8] = cast(ubyte)s;
        }
        return *cast(__m128i*)(result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    foreach(i; 0..16)
        assert(AA.array[i] == cast(byte)(correctResult[i]));
}

/// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance
/// and power consumption of spin-wait loops.
void _mm_pause() @trusted
{
    // Emits the `pause` instruction (encoding F3 90) where possible;
    // degrades to a no-op on targets without an equivalent hint.
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_pause();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "pause;\n" : : : ;
            }
        }
        else __warn_noop();
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_pause();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @trusted
        {
            rep; nop; // F3 90 = pause
        }
    }
    else version (LDC)
    {
        // PERF: Do nothing currently, could be the "yield" instruction on ARM.
    }
    else
        static assert(false);
}
unittest
{
    _mm_pause();
}

/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each
/// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the
/// low 16 bits of 64-bit elements in result.
__m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        // vabdq_u8 = per-byte absolute difference, vpaddlq_u8 = pairwise widening add.
        ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));

        // PERF: Looks suboptimal vs addp
        ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]);
        ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]);
        ushort8 r = 0;
        r[0] = r0;
        r[4] = r4;
        return cast(__m128i) r;
    }
    else
    {
        // PERF: ARM32 is lacking
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        ubyte[16] t;
        foreach(i; 0..16)
        {
            int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
            if (diff < 0) diff = -diff;
            t[i] = cast(ubyte)(diff);
        }
        // Sums land in the low 16 bits of each 64-bit half (lanes 0 and 2 of int4).
        int4 r = _mm_setzero_si128();
        r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
        r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m128i B = _mm_set1_epi8(1);
    __m128i R = _mm_sad_epu8(A, B);
    int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values.
/// Note: `e7` is the highest lane, `e0` the lowest (Intel argument order).
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short8 r = void;
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    r.ptr[4] = e4;
    r.ptr[5] = e5;
    r.ptr[6] = e6;
    r.ptr[7] = e7;
    return cast(__m128i) r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 B = cast(short8) A;
    foreach(i; 0..8)
        assert(B.array[i] == i);
}

/// Set packed 32-bit integers with the supplied values.
/// Note: `e3` is the highest lane, `e0` the lowest (Intel argument order).
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    // PERF: does a constant inline correctly? vs int4 field assignment
    align(16) int[4] r = [e0, e1, e2, e3];
    return *cast(int4*)&r;
}
unittest
{
    __m128i A = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(A.array[i] == i);

    static if (__VERSION__ >= 2094)
        enum __m128i B = _mm_setr_epi32(0, 1, 2, 3);
}

/// Set packed 64-bit integers with the supplied values.
__m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
{
    pragma(inline, true);
    long2 v = void;
    v.ptr[0] = e0.array[0]; // e0 is the low element
    v.ptr[1] = e1.array[0];
    return cast(__m128i) v;
}
unittest
{
    long2 r = cast(long2) _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
    assert(r.array[0] == 5678);
    assert(r.array[1] == 1234);
}

/// Set packed 64-bit integers with the supplied values.
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    pragma(inline, true);
    long2 v = void;
    v.ptr[0] = e0; // e0 is the low element
    v.ptr[1] = e1;
    return cast(__m128i) v;
}
unittest
{
    long2 r = cast(long2) _mm_set_epi64x(1234, -5678);
    assert(r.array[0] == -5678);
    assert(r.array[1] == 1234);
}

/// Set packed 8-bit integers with the supplied values.
/// Arguments are given high-lane-first (Intel order); `e0` lands in lane 0.
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    align(16) byte[16] tmp = [e0, e1, e2, e3, e4, e5, e6, e7,
                              e8, e9, e10, e11, e12, e13, e14, e15];
    return *cast(__m128i*)(tmp.ptr);
}
unittest
{
    byte16 R = cast(byte16) _mm_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
    byte[16] correct = [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
    assert(R.array == correct);
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied values.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    // `e0` is the low element (Intel argument order).
    pragma(inline, true);
    double2 r = void;
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    return r;
}
unittest
{
    __m128d A = _mm_set_pd(61.0, 55.0);
    double[2] correct = [55.0, 61.0];
    assert(A.array == correct);
}

/// Broadcast double-precision (64-bit) floating-point value `a` to all element.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    pragma(inline, true);
    __m128d r = void;
    r.ptr[0] = a;
    r.ptr[1] = a;
    return r;
}
unittest
{
    __m128d A = _mm_set_pd1(61.0);
    double[2] correct = [61.0, 61.0];
    assert(A.array == correct);
}

/// Copy double-precision (64-bit) floating-point element `a` to the lower element of result,
/// and zero the upper element.
__m128d _mm_set_sd (double a) pure @trusted
{
    double2 r = void;
    r.ptr[0] = a;
    r.ptr[1] = 0.0;
    return r;
}
unittest
{
    __m128d A = _mm_set_sd(61.0);
    double[2] correct = [61.0, 0.0];
    assert(A.array == correct);
}

/// Broadcast 16-bit integer a to all elements of dst.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        // Keep the broadcast as a separate statement; do not fold into the return.
        short8 v = a;
        return cast(__m128i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m128i)(short8(a));
    }
}
unittest
{
    short8 a = cast(short8) _mm_set1_epi16(31);
    for (int i = 0; i < 8; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 32-bit integer `a` to all elements.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    pragma(inline, true);
    return cast(__m128i)(int4(a));
}
unittest
{
    int4 A = cast(int4) _mm_set1_epi32(31);
    for (int i = 0; i < 4; ++i)
        assert(A.array[i] == 31);

    // compile-time should work
    static if (__VERSION__ >= 2094)
        enum __m128i B = _mm_set1_epi32(3);
}

/// Broadcast 64-bit integer `a` to all elements.
__m128i _mm_set1_epi64 (__m64 a) pure @safe
{
    return _mm_set_epi64(a, a);
}
unittest
{
    long b = 0x1DEADCAFE;
    __m64 a;
    a.ptr[0] = b;
    long2 c = cast(long2) _mm_set1_epi64(a);
    assert(c.array[0] == b);
    assert(c.array[1] == b);
}

/// Broadcast 64-bit integer `a` to all elements
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
    return cast(__m128i)(b);
}
unittest
{
    long b = 0x1DEADCAFE;
    long2 c = cast(long2) _mm_set1_epi64x(b);
    for (int i = 0; i < 2; ++i)
        assert(c.array[i] == b);
}

/// Broadcast 8-bit integer `a` to all elements.
__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    pragma(inline, true);
    byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
    return cast(__m128i)(b);
}
unittest
{
    byte16 b = cast(byte16) _mm_set1_epi8(31);
    for (int i = 0; i < 16; ++i)
        assert(b.array[i] == 31);
}

/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
alias _mm_set1_pd = _mm_set_pd1;

/// Set packed 16-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4,
                        short e3, short e2, short e1, short e0) pure @trusted
{
    // "Reverse order": the first argument lands in the lowest lane.
    short8 r = void;
    r.ptr[0] = e7;
    r.ptr[1] = e6;
    r.ptr[2] = e5;
    r.ptr[3] = e4;
    r.ptr[4] = e3;
    r.ptr[5] = e2;
    r.ptr[6] = e1;
    r.ptr[7] = e0;
    return cast(__m128i)(r);
}
unittest
{
    short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
    short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
    assert(A.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    // The pointer-cast trick below is not allowed in CTFE, hence the split.
    if (__ctfe)
    {
        __m128i r;
        r.ptr[0] = e3;
        r.ptr[1] = e2;
        r.ptr[2] = e1;
        r.ptr[3] = e0;
        return r;
    }
    else
    {
        // Performs better than = void; with GDC
        pragma(inline, true);
        align(16) int[4] result = [e3, e2, e1, e0];
        return *cast(__m128i*)(result.ptr);
    }
}
unittest
{
    int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
    int[4] correct = [-1, 0, -2147483648, 2147483647];
    assert(A.array == correct);

    // compile-time should work
    static if (__VERSION__ >= 2094)
        enum __m128i B = _mm_setr_epi32(0, 1, 2, 3);
}

/// Set packed 64-bit integers with the supplied values in reverse order.
/// Note: takes plain `long` values rather than `__m64` operands.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long2 r = void;
    r.ptr[0] = e1;
    r.ptr[1] = e0;
    return cast(__m128i)(r);
}
unittest
{
    long2 A = cast(long2) _mm_setr_epi64(-1, 0);
    long[2] correct = [-1, 0];
    assert(A.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9, byte e8,
                       byte e7, byte e6, byte e5, byte e4,
                       byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    // "Reverse order": the first argument lands in the lowest lane.
    align(16) byte[16] tmp = [e15, e14, e13, e12, e11, e10, e9, e8,
                              e7, e6, e5, e4, e3, e2, e1, e0];
    return *cast(__m128i*)(tmp.ptr);
}
unittest
{
    byte16 r = cast(byte16) _mm_setr_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
    byte[16] expected = [-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14];
    assert(r.array == expected);
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    pragma(inline, true);
    double2 v;
    v.ptr[0] = e1; // first argument goes to the low lane
    v.ptr[1] = e0;
    return v;
}
unittest
{
    __m128d r = _mm_setr_pd(61.0, 55.0);
    double[2] expected = [61.0, 55.0];
    assert(r.array == expected);
}

/// Return vector of type `__m128d` with all elements set to zero.
__m128d _mm_setzero_pd() pure @trusted
{
    pragma(inline, true);
    double2 z = void;
    z.ptr[0] = 0.0;
    z.ptr[1] = 0.0;
    return z;
}
unittest
{
    __m128d r = _mm_setzero_pd();
    double[2] expected = [0.0, 0.0];
    assert(r.array == expected);
}

/// Return vector of type `__m128i` with all elements set to zero.
__m128i _mm_setzero_si128() pure @trusted
{
    pragma(inline, true);
    int4 z = void;
    z.ptr[0] = 0;
    z.ptr[1] = 0;
    z.ptr[2] = 0;
    z.ptr[3] = 0;
    return z;
}
unittest
{
    __m128i r = _mm_setzero_si128();
    int[4] expected = [0, 0, 0, 0];
    assert(r.array == expected);
}

/// Shuffle 32-bit integers in `a` using the control in `imm8`.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @trusted
{
    // Each 2-bit field of imm8 selects the source lane for one result lane.
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufd(a, imm8);
    }
    else static if (LDC_with_optimizations)
    {
        return shufflevectorLDC!(int4, (imm8 >> 0) & 3,
                                       (imm8 >> 2) & 3,
                                       (imm8 >> 4) & 3,
                                       (imm8 >> 6) & 3)(a, a);
    }
    else
    {
        int4 r = void;
        r.ptr[0] = a.ptr[(imm8 >> 0) & 3];
        r.ptr[1] = a.ptr[(imm8 >> 2) & 3];
        r.ptr[2] = a.ptr[(imm8 >> 4) & 3];
        r.ptr[3] = a.ptr[(imm8 >> 6) & 3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

/// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
/// See_also: `_MM_SHUFFLE2`.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @trusted
{
    // Result low lane comes from `a`, high lane from `b`, each selected by one bit of imm8.
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_shufpd(a, b, imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(double2, 0 + ( imm8 & 1 ),
                                          2 + ( (imm8 >> 1) & 1 ))(a, b);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[imm8 & 1];
        r.ptr[1] = b.array[(imm8 >> 1) & 1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}

/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high
/// 64 bits of result, with the low 64 bits being copied from from `a` to result.
/// See also: `_MM_SHUFFLE`.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @trusted
{
    // Lanes 0..3 pass through unchanged; lanes 4..7 are permuted within the high half.
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PSHUFHW, a, a, cast(ubyte)imm8);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
    }
    else static if (LDC_with_optimizations)
    {
        return cast(__m128i) shufflevectorLDC!(short8, 0, 1, 2, 3,
                                          4 + ( (imm8 >> 0) & 3 ),
                                          4 + ( (imm8 >> 2) & 3 ),
                                          4 + ( (imm8 >> 4) & 3 ),
                                          4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
    }
    else
    {
        short8 r = cast(short8)a;
        short8 sa = cast(short8)a;
        r.ptr[4] = sa.array[4 + ( (imm8 >> 0) & 3 ) ];
        r.ptr[5] = sa.array[4 + ( (imm8 >> 2) & 3 ) ];
        r.ptr[6] = sa.array[4 + ( (imm8 >> 4) & 3 ) ];
        r.ptr[7] = sa.array[4 + ( (imm8 >> 6) & 3 ) ];
        return cast(__m128i) r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}

/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64
/// bits of result, with the high 64 bits being copied from from `a` to result.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @trusted
{
    // Lanes 4..7 pass through unchanged; lanes 0..3 are permuted within the low half.
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PSHUFLW, a, a, cast(ubyte)imm8);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
    }
    else static if (LDC_with_optimizations)
    {
        return cast(__m128i) shufflevectorLDC!(short8, ( (imm8 >> 0) & 3 ),
                                                       ( (imm8 >> 2) & 3 ),
                                                       ( (imm8 >> 4) & 3 ),
                                                       ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
    }
    else
    {
        short8 r = cast(short8)a;
        short8 sa = cast(short8)a;
        r.ptr[0] = sa.array[(imm8 >> 0) & 3];
        r.ptr[1] = sa.array[(imm8 >> 2) & 3];
        r.ptr[2] = sa.array[(imm8 >> 4) & 3];
        r.ptr[3] = sa.array[(imm8 >> 6) & 3];
        return cast(__m128i) r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}

/// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
/// Bit-shift is a single value in the low-order 64-bit of `count`.
/// If bit-shift > 31, result is defined to be all zeroes.
/// Note: prefer `_mm_slli_epi32`, less of a trap.
__m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else
    {
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        // An out-of-range shift amount must yield all zeroes. Check it before
        // shifting: shifting a 32-bit lane by >= 32 bits is undefined in D,
        // so the old shift-then-zero order relied on undefined behaviour.
        // (Same pattern as the _mm_slli_epi32 fallback.)
        if (bits > 31)
            return int4(0);
        int4 r = void;
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << bits;
        return r;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi32(4, -9, 11, -2147483648);
    int[4] correct0 = A.array;
    int[4] correctX = [0, 0, 0, 0];
    int[4] correct2 = [16, -36, 44, 0];
    int4 B0 = cast(int4) _mm_sll_epi32(A, shift0);
    int4 BX = cast(int4) _mm_sll_epi32(A, shiftX);
    int4 B2 = cast(int4) _mm_sll_epi32(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}

/// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
/// Bit-shift is a single value in the low-order 64-bit of `count`.
/// If bit-shift > 63, result is defined to be all zeroes.
/// Note: prefer `_mm_slli_epi64`, less of a trap.
__m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
    else
    {
        // ARM: good since LDC 1.12 -O2
        // ~but -O0 version is catastrophic
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        // An out-of-range shift amount must yield all zeroes. Check it before
        // shifting: shifting a 64-bit lane by >= 64 bits is undefined in D,
        // so the old shift-then-zero order relied on undefined behaviour.
        // (Same pattern as the _mm_slli_epi64 fallback.)
        if (bits > 63)
            return cast(__m128i) long2(0);
        long2 r = void;
        foreach(i; 0..2)
            r.array[i] = cast(ulong)(sa.array[i]) << bits;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi64(4, -9);
    long[2] correct0 = [ 4, -9];
    long[2] correctX = [ 0,  0];
    long[2] correct2 = [16, -36];
    long2 B0 = cast(long2) _mm_sll_epi64(A, shift0);
    long2 BX = cast(long2) _mm_sll_epi64(A, shiftX);
    long2 B2 = cast(long2) _mm_sll_epi64(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}

/// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
/// Bit-shift is a single value in the low-order 64-bit of `count`.
/// If bit-shift > 15, result is defined to be all zeroes.
/// Warning: prefer `_mm_slli_epi16`, less of a trap.
__m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i)__builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        // An out-of-range shift amount must yield all zeroes. Check it before
        // shifting: shifting by >= the promoted operand width is undefined in D,
        // so the old shift-then-zero order relied on undefined behaviour.
        // (Same pattern as the _mm_slli_epi16 fallback below.)
        if (bits > 15)
            return cast(int4) short8(0);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi16(4, -8, 11, -32768, 4, -8, 11, -32768);
    short[8] correct0 = (cast(short8)A).array;
    short[8] correctX = [0, 0, 0, 0, 0, 0, 0, 0];
    short[8] correct2 = [16, -32, 44, 0, 16, -32, 44, 0];
    short8 B0 = cast(short8) _mm_sll_epi16(A, shift0);
    short8 BX = cast(short8) _mm_sll_epi16(A, shiftX);
    short8 B2 = cast(short8) _mm_sll_epi16(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        int4 r = _mm_setzero_si128();

        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            return r;

        foreach(i; 0..4)
            r.array[i] = cast(uint)(a.array[i]) << count;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    __m128i B2 = _mm_slli_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    __m128i C = _mm_slli_epi32(A, 0);
    int[4] expectedC = [ 0, 2, 3, -4];
    assert(C.array == expectedC);

    __m128i D = _mm_slli_epi32(A, 65);
    int[4] expectedD = [ 0, 0, 0, 0];
    assert(D.array == expectedD);
}

/// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
__m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else
    {
        long2 sa = cast(long2)a;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        long2 r = cast(long2) _mm_setzero_si128();
        ubyte count = cast(ubyte) imm8;
        if (count > 63)
            return cast(__m128i)r;

        r.ptr[0] = cast(ulong)(sa.array[0]) << count;
        r.ptr[1] = cast(ulong)(sa.array[1]) << count;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_slli_epi64(A, 1);
    long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    long2 C = cast(long2) _mm_slli_epi64(A, 0);
    long[2] expectedC = [ 8, -4];
    assert(C.array == expectedC);

    long2 D = cast(long2) _mm_slli_epi64(A, 64);
    long[2] expectedD = [ 0, -0];
    assert(D.array == expectedD);
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
__m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_ARM64)
    {
        short8 sa = cast(short8)a;
        short8 r = cast(short8)_mm_setzero_si128();
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m128i)r;
        r = sa << short8(count);
        return cast(__m128i)r;
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 r = cast(short8)_mm_setzero_si128();
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m128i)r;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(sa.array[i] << count);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
    short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
    short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}


/// Shift `a` left by `bytes` bytes while shifting in zeros.
__m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
{
    // bytes >= 16 shifts everything out: result is all zeroes.
    static if (bytes & 0xF0)
    {
        return _mm_setzero_si128();
    }
    else static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd_ib(XMM.PSLLDQ, op, bytes);
    }
    else static if (GDC_with_SSE2)
    {
        pragma(inline, true); // else it doesn't seem to be inlined at all by GDC PERF do it in _mm_srli_si128 and check
        return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8));
    }
    else static if (LDC_with_optimizations)
    {
        return cast(__m128i) shufflevectorLDC!(byte16,
                                               16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
                                               22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
                                               28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
                                               (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
        {
            movdqu XMM0, op;
            pslldq XMM0, bytes;
            movdqu op, XMM0;
        }
        return op;
    }
    else
    {
        byte16 A = cast(byte16)op;
        byte16 R = void;
        for (int n = 15; n >= bytes; --n)
            R.ptr[n] = A.array[n-bytes];
        for (int n = bytes-1; n >= 0; --n)
            R.ptr[n] = 0;
        return cast(__m128i)R;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
    short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(R.array == correct);

    __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1));
    int[4] expectedB = [0, 0, 0, 0];
    assert(B.array == expectedB);
}

/// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
__m128d _mm_sqrt_pd(__m128d vec) pure @trusted
{
    version(LDC)
    {
        // Disappeared with LDC 1.11
        static if (__VERSION__ < 2081)
            return __builtin_ia32_sqrtpd(vec);
        else
        {
            // PERF: use llvm_sqrt on the vector
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_sqrtpd(vec);
    }
    else
    {
        vec.ptr[0] = sqrt(vec.array[0]);
        vec.ptr[1] = sqrt(vec.array[1]);
        return vec;
    }
}

/// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
{
    // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
    //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same.
    //        The quadword at bits 127:64 of the destination operand remains unchanged."
    version(LDC)
    {
        // Disappeared with LDC 1.11
        static if (__VERSION__ < 2081)
        {
            __m128d c = __builtin_ia32_sqrtsd(b);
            a[0] = c[0];
            return a;
        }
        else
        {
            a.array[0] = llvm_sqrt(b.array[0]);
            return a;
        }
    }
    else static if (GDC_with_SSE2)
    {
        __m128d c = __builtin_ia32_sqrtsd(b);
        a.ptr[0] = c.array[0];
        return a;
    }
    else
    {
        a.ptr[0] = sqrt(b.array[0]);
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    __m128d R = _mm_sqrt_sd(A, B);
    double[2] correct = [2.0, 3.0 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
/// Bit-shift is a single value in the low-order 64-bit of `count`.
/// If bit-shift > 15, result is defined to be all sign bits.
/// Warning: prefer `_mm_srai_epi16`, less of a trap.
__m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        // Clamping to 15 reproduces the "all sign bits" saturating behaviour
        // of an arithmetic shift with an out-of-range count.
        if (bits > 15)
            bits = 15;
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(sa.array[i] >> bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi16(4, -9, 11, -32768, 4, -8, 11, -32768);
    short[8] correct0 = (cast(short8)A).array;
    short[8] correctX = [0, -1, 0, -1, 0, -1, 0, -1];
    short[8] correct2 = [1, -3, 2, -8192, 1, -2, 2, -8192];
    short8 B0 = cast(short8) _mm_sra_epi16(A, shift0);
    short8 BX = cast(short8) _mm_sra_epi16(A, shiftX);
    short8 B2 = cast(short8) _mm_sra_epi16(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}

/// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
/// Bit-shift is a single value in the low-order 64-bit of `count`.
/// If bit-shift > 31, result is defined to be all sign bits.
/// Note: prefer `_mm_srai_epi32`, less of a trap.
__m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        return __builtin_ia32_psrad128(a, count);
    }
    else
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        // Clamping to 31 reproduces the "all sign bits" saturating behaviour
        // of an arithmetic shift with an out-of-range count.
        if (bits > 31)
            bits = 31;
        r.ptr[0] = (a.array[0] >> bits);
        r.ptr[1] = (a.array[1] >> bits);
        r.ptr[2] = (a.array[2] >> bits);
        r.ptr[3] = (a.array[3] >> bits);
        return r;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi32(4, -9, 11, -2147483648);
    int[4] correct0 = A.array;
    int[4] correctX = [0, -1, 0, -1];
    int[4] correct2 = [1, -3, 2, -536870912];
    int4 B0 = cast(int4) _mm_sra_epi32(A, shift0);
    int4 BX = cast(int4) _mm_sra_epi32(A, shiftX);
    int4 B2 = cast(int4) _mm_sra_epi32(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        // GDC and LDC expose the exact same builtin; one branch covers both,
        // consistent with the other shift intrinsics in this module.
        return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_ARM64)
    {
        short8 sa = cast(short8)a;
        ubyte count = cast(ubyte)imm8;
        if (count > 15)
            count = 15; // an arithmetic shift by 15 replicates the sign bit, like the instruction
        short8 r = sa >> short8(count);
        return cast(__m128i)r;
    }
    else
    {
        // Note: the intrinsic takes imm8[0..7], but D says "It's illegal to
        // shift by the same or more bits than the size of the quantity being
        // shifted" and makes it UB — hence the clamp to 15 before shifting.
        short8 sa = cast(short8)a;
        short8 r = void;
        ubyte count = cast(ubyte)imm8;
        if (count > 15)
            count = 15;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(sa.array[i] >> count);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
    short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
    short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
    assert(C.array == expectedC);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        // GDC and LDC expose the same builtin; a single branch covers both.
        return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsic takes imm8[0..7], but D says "It's illegal to
        // shift by the same or more bits than the size of the quantity being
        // shifted" and makes it UB. Clamping to 31 yields all sign bits,
        // exactly like the hardware. See Issue: #56
        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            count = 31;

        int4 r = void;
        r.ptr[0] = (a.array[0] >> count);
        r.ptr[1] = (a.array[1] >> count);
        r.ptr[2] = (a.array[2] >> count);
        r.ptr[3] = (a.array[3] >> count);
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srai_epi32(A, 1);
    __m128i B2 = _mm_srai_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    __m128i C = _mm_srai_epi32(A, 32);
    int[4] expectedC = [ 0, 0, 0, -1];
    assert(C.array == expectedC);

    __m128i D = _mm_srai_epi32(A, 0);
    int[4] expectedD = [ 0, 2, 3, -4];
    assert(D.array == expectedD);
}

/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
/// Bit-shift is a single value in the low-order 64-bit of `count`.
/// If bit-shift > 15, result is defined to be all zeroes.
/// Warning: prefer `_mm_srli_epi16`, less of a trap.
__m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
{
    // PERF ARM64
    static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
    }
    else
    {
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);

        // Same semantics as the x86 instruction: an out-of-range shift amount
        // zeroes the result. Checking *before* shifting also avoids a D shift
        // by >= bit-width, which is undefined behaviour (the previous code
        // shifted by the unclamped amount and only zeroed afterwards).
        if (bits > 15)
            return cast(__m128i) short8(0);

        short8 sa = cast(short8)a;
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi16(4, -8, 11, -32768, 4, -8, 11, -32768);
    short[8] correct0 = (cast(short8)A).array;
    short[8] correctX = [0, 0, 0, 0, 0, 0, 0, 0];
    short[8] correct2 = [1, 16382, 2, 8192, 1, 16382, 2, 8192];
    short8 B0 = cast(short8) _mm_srl_epi16(A, shift0);
    short8 BX = cast(short8) _mm_srl_epi16(A, shiftX);
    short8 B2 = cast(short8) _mm_srl_epi16(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}

/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
/// Bit-shift is a single value in the low-order 64-bit of `count`.
/// If bit-shift > 31, result is defined to be all zeroes.
/// Note: prefer `_mm_srli_epi32`, less of a trap.
__m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        return __builtin_ia32_psrld128(a, count);
    }
    else
    {
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);

        // Same semantics as the x86 instruction: shift amounts above 31
        // produce zero. Checking *before* shifting also avoids a D shift by
        // >= bit-width, which is undefined behaviour (the previous code
        // shifted by the unclamped amount and only zeroed afterwards).
        if (bits > 31)
            return int4(0);

        int4 r = void;
        r.ptr[0] = cast(uint)(a.array[0]) >> bits;
        r.ptr[1] = cast(uint)(a.array[1]) >> bits;
        r.ptr[2] = cast(uint)(a.array[2]) >> bits;
        r.ptr[3] = cast(uint)(a.array[3]) >> bits;
        return r;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi32(4, -8, 11, -0x80000000);
    int[4] correct0 = A.array;
    int[4] correctX = [0, 0, 0, 0];
    int[4] correct2 = [1, 1073741822, 2, 536870912];
    int4 B0 = cast(int4) _mm_srl_epi32(A, shift0);
    int4 BX = cast(int4) _mm_srl_epi32(A, shiftX);
    int4 B2 = cast(int4) _mm_srl_epi32(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}

/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeroes.
/// Bit-shift is a single value in the low-order 64-bit of `count`.
/// If bit-shift > 63, result is defined to be all zeroes.
/// Note: prefer `_mm_srli_epi64`, less of a trap.
__m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
    }
    else
    {
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);

        // Same semantics as the x86 instruction: an out-of-range shift amount
        // zeroes the result. Checking *before* shifting also avoids a D shift
        // by >= 64, which is undefined behaviour (the previous code shifted
        // by the unclamped amount and only zeroed afterwards).
        if (bits > 63)
            return cast(__m128i) long2(0);

        long2 sa = cast(long2)a;
        long2 r;
        r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
        r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i shift0 = _mm_setzero_si128();
    __m128i shiftX = _mm_set1_epi64x(0x8000_0000_0000_0000); // too large shift
    __m128i shift2 = _mm_setr_epi32(2, 0, 4, 5);
    __m128i A = _mm_setr_epi64(4, -9);
    long[2] correct0 = [4, -9];
    long[2] correctX = [0, 0];
    long[2] correct2 = [1, 4611686018427387901];
    long2 B0 = cast(long2) _mm_srl_epi64(A, shift0);
    long2 BX = cast(long2) _mm_srl_epi64(A, shiftX);
    long2 B2 = cast(long2) _mm_srl_epi64(A, shift2);
    assert(B0.array == correct0);
    assert(BX.array == correctX);
    assert(B2.array == correct2);
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        // GDC and LDC expose the same builtin; a single branch covers both.
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_ARM64)
    {
        short8 r = cast(short8) _mm_setzero_si128();

        ubyte count = cast(ubyte)imm8;
        if (count >= 16)
            return cast(__m128i)r;

        short8 sa = cast(short8)a;
        r = sa >>> short8(count); // This facility offered with LDC, but not DMD.
        return cast(__m128i)r;
    }
    else
    {
        // Note: the intrinsic takes imm8[0..7]; shifting by >= 16 would be UB
        // in D, so such counts return zero (as the instruction does).
        short8 sa = cast(short8)a;
        ubyte count = cast(ubyte)imm8;

        short8 r = cast(short8) _mm_setzero_si128();
        if (count >= 16)
            return cast(__m128i)r;

        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
    short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
    short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0];
    assert(C.array == expectedC);

    short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
    short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
    assert(D.array == expectedD);
}


/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_or_LDC_with_SSE2)
    {
        // GDC and LDC expose the same builtin; a single branch covers both.
        return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsic takes imm8[0..7]; shifting by >= 32 would be UB
        // in D, so such counts return zero (as the instruction does).
        ubyte count = cast(ubyte) imm8;

        int4 r = _mm_setzero_si128();
        if (count >= 32)
            return r;
        r.ptr[0] = a.array[0] >>> count;
        r.ptr[1] = a.array[1] >>> count;
        r.ptr[2] = a.array[2] >>> count;
        r.ptr[3] = a.array[3] >>> count;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srli_epi32(A, 1);
    __m128i B2 = _mm_srli_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    __m128i C = _mm_srli_epi32(A, 255);
    int[4] expectedC = [ 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}

/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
__m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
{
    // PERF DMD
    static if (GDC_or_LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsic takes imm8[0..7]; shifting by >= 64 would be UB
        // in D, so such counts return zero (as the instruction does).
        long2 r = cast(long2) _mm_setzero_si128();
        long2 sa = cast(long2)a;

        ubyte count = cast(ubyte) imm8;
        if (count >= 64)
            return cast(__m128i)r;

        r.ptr[0] = sa.array[0] >>> count;
        r.ptr[1] = sa.array[1] >>> count;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srli_epi64(A, 1);
    long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    long2 C = cast(long2) _mm_srli_epi64(A, 64);
    long[2] expectedC = [ 0, 0 ];
    assert(C.array == expectedC);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @trusted
{
    static if (bytes & 0xF0)
    {
        // Shifting by 16 bytes or more always produces zero.
        return _mm_setzero_si128();
    }
    else static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd_ib(XMM.PSRLDQ, v, bytes);
    }
    else static if (GDC_with_SSE2)
    {
        // The GDC builtin counts in bits, not bytes.
        return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, v;
            psrldq XMM0, bytes;
            movdqu v, XMM0;
        }
        return v;
    }
    else static if (LDC_with_optimizations)
    {
        return cast(__m128i) shufflevectorLDC!(byte16,
            bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
            bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
            (cast(byte16) v, cast(byte16)_mm_setzero_si128());
    }
    else
    {
        // Portable fallback: move the surviving low bytes down, zero the top.
        byte16 src = cast(byte16)v;
        byte16 res = void;
        foreach (i; 0 .. 16 - bytes)
            res.ptr[i] = src.array[i + bytes];
        foreach (i; 16 - bytes .. 16)
            res.ptr[i] = 0;
        return cast(__m128i)res;
    }
}
unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, -2, 1));
    int[4] correct = [-2, 3, 4, 0];
    assert(R.array == correct);

    __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
    int[4] expectedA = [0, 0, 0, 0];
    assert(A.array == expectedA);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    // Reinterpret as integers, byte-shift, reinterpret back.
    return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
}
unittest
{
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    // Reinterpret as integers, byte-shift, reinterpret back.
    return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
{
    pragma(inline, true);
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a;
}
unittest
{
    align(16) double[2] A;
    __m128d B = _mm_setr_pd(-8.0, 9.0);
    _mm_store_pd(A.ptr, B);
    assert(A == [-8.0, 9.0]);
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* dest = cast(__m128d*)mem_addr;
    __m128d broadcast; // PERF =void;
    broadcast.ptr[0] = a.array[0];
    broadcast.ptr[1] = a.array[0];
    *dest = broadcast;
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to
/// be aligned on any particular boundary.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[0];
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a
/// general-protection exception may be generated.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1; ///

/// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[1];
}

// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exist in C++.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    long2 la = cast(long2)a;
    long* dest = cast(long*)mem_addr;
    *dest = la.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[0];
}

/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse
/// order. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @system
{
    // Swap the two lanes, then do an aligned 128-bit store.
    __m128d swapped = void;
    swapped.ptr[0] = a.array[1];
    swapped.ptr[1] = a.array[0];
    *cast(__m128d*)mem_addr = swapped;
}
unittest
{
    align(16) double[2] A = [0.0, 1.0];
    _mm_storer_pd(A.ptr, _mm_setr_pd(2.0, 3.0));
    assert(A[0] == 3.0 && A[1] == 2.0);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @trusted // TODO: signature, should be system
{
    // PERF DMD
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_storeupd(mem_addr, a);
    }
    else static if (LDC_with_optimizations)
    {
        storeUnaligned!double2(a, mem_addr);
    }
    else
    {
        // Scalar fallback: two unaligned 64-bit stores.
        mem_addr[0] = a.array[0];
        mem_addr[1] = a.array[1];
    }
}
unittest
{
    __m128d A = _mm_setr_pd(3.0, 4.0);
    align(16) double[4] R = [0.0, 0, 0, 0];
    double[2] correct = [3.0, 4.0];
    _mm_storeu_pd(&R[1], A);
    assert(R[1..3] == correct);
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular
/// boundary.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @trusted // TODO: signature is wrong, mem_addr is not aligned. Make it @system
{
    // PERF: DMD
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_storedqu(cast(char*)mem_addr, cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        storeUnaligned!__m128i(a, cast(int*)mem_addr);
    }
    else
    {
        // Scalar fallback: four unaligned 32-bit stores.
        int* p = cast(int*)mem_addr;
        p[0] = a.array[0];
        p[1] = a.array[1];
        p[2] = a.array[2];
        p[3] = a.array[3];
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    align(16) int[6] R = [0, 0, 0, 0, 0, 0];
    int[4] correct = [1, 2, 3, 4];
    _mm_storeu_si128(cast(__m128i*)(&R[1]), A);
    assert(R[1..5] == correct);
}

/// Store 16-bit integer from the first element of `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si16 (void* mem_addr, __m128i a) pure @system
{
    // Store only the lowest 16-bit lane.
    short* dest = cast(short*)mem_addr;
    *dest = (cast(short8)a).array[0];
}
unittest
{
    short[2] arr = [-24, 12];
    _mm_storeu_si16(&arr[1], _mm_set1_epi16(26));
    short[2] correct = [-24, 26];
    assert(arr == correct);
}

/// Store 32-bit integer from the first element of `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted // TODO should really be @system
{
    pragma(inline, true);
    // Store only the lowest 32-bit lane.
    int* dest = cast(int*)mem_addr;
    *dest = a.array[0];
}
unittest
{
    int[2] arr = [-24, 12];
    _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
    assert(arr == [-24, -1]);
}

/// Store 64-bit integer from the first element of `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si64 (void* mem_addr, __m128i a) pure @system
{
    pragma(inline, true);
    // Store only the lowest 64-bit lane.
    long2 la = cast(long2)a;
    long* dest = cast(long*)mem_addr;
    *dest = la.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storeu_si64(&A[1], _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_pd (double* mem_addr, __m128d a) pure @system
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movntpd(mem_addr, a);
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // Emit a store tagged with !nontemporal metadata.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <2 x double> %1, <2 x double>* %0, align 16, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, double2*, double2)(cast(double2*)mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m128d* dest = cast(__m128d*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(16) double[2] A;
    __m128d B = _mm_setr_pd(-8.0, 9.0);
    _mm_stream_pd(A.ptr, B);
    assert(A == [-8.0, 9.0]);
}

/// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movntdq (cast(long2*)mem_addr, cast(long2)a);
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // Emit a store tagged with !nontemporal metadata.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x i32> %1, <4 x i32>* %0, align 16, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, int4*, int4)(cast(int4*)mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m128i* dest = cast(__m128i*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(16) int[4] A;
    __m128i B = _mm_setr_epi32(-8, 9, 10, -11);
    _mm_stream_si128(cast(__m128i*)A.ptr, B);
    assert(A == [-8, 9, 10, -11]);
}

/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address `mem_addr` is already in the cache,
/// the cache will be updated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_si32 (int* mem_addr, int a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movnti(mem_addr, a);
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // Emit a store tagged with !nontemporal metadata.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store i32 %1, i32* %0, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, int*, int)(mem_addr, a);
    }
    else
    {
        // Regular store instead.
        *mem_addr = a;
    }
}
unittest
{
    int A;
    _mm_stream_si32(&A, -34);
    assert(A == -34);
}

/// Store 64-bit integer a into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address `mem_addr` is already
/// in the cache, the cache will be updated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_si64 (long* mem_addr, long a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movnti64(mem_addr, a);
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // Emit a store tagged with !nontemporal metadata.
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store i64 %1, i64* %0, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, long*, long)(mem_addr, a);
    }
    else
    {
        // Regular store instead.
        *mem_addr = a;
    }
}
unittest
{
    long A;
    _mm_stream_si64(&A, -46);
    assert(A == -46);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    // Wrapping lane-wise subtraction.
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(16, 32767, 1, 2, 3, 4, 6, 6);
    __m128i B = _mm_setr_epi16(15, -32768, 6, 8, 1000, 1, 5, 6);
    short8 C = cast(short8) _mm_sub_epi16(A, B);
    short[8] correct = [ 1, -1,-5,-6, -997, 3, 1, 0];
    assert(C.array == correct);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    // Wrapping lane-wise subtraction.
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}
unittest
{
    __m128i A = _mm_setr_epi32(16, int.max, 1, 8);
    __m128i B = _mm_setr_epi32(15, int.min, 6, 2);
    int4 C = cast(int4) _mm_sub_epi32(A, B);
    int[4] correct = [ 1, -1,-5, 6];
    assert(C.array == correct);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    // Wrapping lane-wise subtraction.
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}
unittest
{
    __m128i A = _mm_setr_epi64( 16, long.max);
    __m128i B = _mm_setr_epi64( 199, long.min);
    long2 C = cast(long2) _mm_sub_epi64(A, B);
    long[2] correct = [-183, -1];
    assert(C.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    // Wrapping lane-wise subtraction.
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(16, 127, 1, 2, 3, 4, 6, 6, 16, 127, 1, 2, 3, 4, 6, 6);
    __m128i B = _mm_setr_epi8(15, -128, 6, 8, 3, 1, 5, 6, 16, 127, 1, 2, 3, 4, 6, 6);
    byte16 C = cast(byte16) _mm_sub_epi8(A, B);
    byte[16] correct = [ 1, -1,-5,-6, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit)
/// floating-point elements in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a - b;
}
unittest
{
    __m128d A = _mm_setr_pd(4000.0, -8.0);
    __m128d B = _mm_setr_pd(12.0, -8450.0);
    __m128d C = _mm_sub_pd(A, B);
    double[2] correct = [3988.0, 8442.0];
    assert(C.array == correct);
}

/// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit)
/// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
/// upper element of result.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_subsd(a, b);
    }
    else
    {
        // Only the low lane changes; the high lane of `a` is kept.
        a.ptr[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}

/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    pragma(inline, true);
    return a - b;
}
unittest
{
    __m64 A, B;
    A = -1214;
    B = 489415;
    __m64 C = _mm_sub_si64(B, A);
    assert(C.array[0] == 489415 + 1214);
}

/// Subtract packed signed 16-bit integers in `b` from packed 16-bit integers in `a` using
/// saturation.
__m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
{
    // PERF DMD psubsw
    static if(LDC_with_saturated_intrinsics)
    {
        return cast(__m128i) inteli_llvm_subs!short8(cast(short8)a, cast(short8)b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
    }
    else
    {
        // Scalar fallback: subtract in 32-bit then saturate each lane.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short[8] res; // PERF =void;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed signed 8-bit integers in `b` from packed 8-bit integers in `a` using
/// saturation.
__m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
{
    static if(LDC_with_saturated_intrinsics)
    {
        return cast(__m128i) inteli_llvm_subs!byte16(cast(byte16)a, cast(byte16)b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        // Scalar fallback: subtract in a wider type then saturate each lane.
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte[16] res; // PERF =void;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed 16-bit unsigned integers in `a` and `b` using unsigned saturation.
__m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
{
    static if(LDC_with_saturated_intrinsics)
    {
        // LDC: LLVM's unsigned saturating subtraction intrinsic for 8 x i16.
        return cast(__m128i) inteli_llvm_subus!short8(cast(short8)a, cast(short8)b);
    }
    else static if (GDC_with_SSE2)
    {
        // GDC: direct psubusw builtin.
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            // Reinterpret lanes as unsigned, subtract in 32-bit so an
            // underflow yields a negative int, then clamp to [0, 65535].
            int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            res[i] = saturateSignedIntToUnsignedShort(sum);
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}

/// Subtract packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
__m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
{
    static if(LDC_with_saturated_intrinsics)
    {
        // LDC: LLVM's unsigned saturating subtraction intrinsic for 16 x i8.
        return cast(__m128i) inteli_llvm_subus!byte16(cast(byte16)a, cast(byte16)b);
    }
    else static if (GDC_with_SSE2)
    {
        // GDC: direct psubusb builtin.
        return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        ubyte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            // Reinterpret lanes as unsigned, subtract in 32-bit, clamp to [0, 255].
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Note: the only difference between these intrinsics is the signalling
// behaviour of quiet NaNs. This is incorrect but the case where
// you would want to differentiate between qNaN and sNaN and then
// treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd; ///
alias _mm_ucomige_sd = _mm_comige_sd; ///
alias _mm_ucomigt_sd = _mm_comigt_sd; ///
alias _mm_ucomile_sd = _mm_comile_sd; ///
alias _mm_ucomilt_sd = _mm_comilt_sd; ///
alias _mm_ucomineq_sd = _mm_comineq_sd; ///

/// Return vector of type `__m128d` with undefined elements.
__m128d _mm_undefined_pd() pure @safe
{
    pragma(inline, true);
    // `= void` deliberately leaves the vector uninitialized, matching the
    // "undefined contents" contract of the intrinsic.
    __m128d result = void;
    return result;
}

/// Return vector of type `__m128i` with undefined elements.
__m128i _mm_undefined_si128() pure @safe
{
    pragma(inline, true);
    // `= void` deliberately leaves the vector uninitialized, matching the
    // "undefined contents" contract of the intrinsic.
    __m128i result = void;
    return result;
}

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // DMD: native D_SIMD punpckhwd.
        return cast(__m128i) __simd(XMM.PUNPCKHWD, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        // GDC: direct punpckhwd builtin.
        return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: shufflevector selecting the upper 4 lanes of each operand, interleaved.
        enum ir = `%r = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
                   ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
    {
        // x86 inline asm fallback; result is written back into `a`.
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Scalar fallback: r = [a4, b4, a5, b5, a6, b6, a7, b7].
        short8 r = void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        r.ptr[0] = sa.array[4];
        r.ptr[1] = sb.array[4];
        r.ptr[2] = sa.array[5];
        r.ptr[3] = sb.array[5];
        r.ptr[4] = sa.array[6];
        r.ptr[5] = sb.array[6];
        r.ptr[6] = sa.array[7];
        r.ptr[7] = sb.array[7];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // DMD: native D_SIMD punpckhdq.
        return cast(__m128i) __simd(XMM.PUNPCKHDQ, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        // GDC: direct punpckhdq builtin.
        return __builtin_ia32_punpckhdq128(a, b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: shufflevector selecting the upper 2 lanes of each operand, interleaved.
        enum ir = `%r = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
                   ret <4 x i32> %r`;
        return LDCInlineIR!(ir, int4, int4, int4)(cast(int4)a, cast(int4)b);
    }
    else
    {
        // Scalar fallback: r = [a2, b2, a3, b3].
        __m128i r = void;
        r.ptr[0] = a.array[2];
        r.ptr[1] = b.array[2];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    __m128i C = _mm_unpackhi_epi32(A, B);
    int[4] correct = [3, 7, 4, 8];
    assert(C.array == correct);
}

/// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct punpckhqdq builtin.
        return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
    }
    else
    {
        // Start from b (keeps b's high 64 bits in the upper lanes), then
        // overwrite the low 64 bits with a's high 64 bits (int lanes 2 and 3).
        __m128i r = cast(__m128i)b;
        r[0] = a[2];
        r[1] = a[3];
        return r;
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
    long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
    assert(C.array == correct);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // DMD: native D_SIMD punpckhbw.
        return cast(__m128i) __simd(XMM.PUNPCKHBW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        // GDC: direct punpckhbw builtin.
        return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: shufflevector selecting the upper 8 lanes of each operand, interleaved.
        enum ir = `%r = shufflevector <16 x i8> %0, <16 x i8> %1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
                   ret <16 x i8> %r`;
        return cast(__m128i)LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
    {
        // x86 inline asm fallback; result is written back into `a`.
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Scalar fallback: r = [a8, b8, a9, b9, ..., a15, b15].
        byte16 r = void;
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        r.ptr[0] = ba.array[8];
        r.ptr[1] = bb.array[8];
        r.ptr[2] = ba.array[9];
        r.ptr[3] = bb.array[9];
        r.ptr[4] = ba.array[10];
        r.ptr[5] = bb.array[10];
        r.ptr[6] = ba.array[11];
        r.ptr[7] = bb.array[11];
        r.ptr[8] = ba.array[12];
        r.ptr[9] = bb.array[12];
        r.ptr[10] = ba.array[13];
        r.ptr[11] = bb.array[13];
        r.ptr[12] = ba.array[14];
        r.ptr[13] = bb.array[14];
        r.ptr[14] = ba.array[15];
        r.ptr[15] = bb.array[15];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B);
    byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
    assert(C.array == correct);
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        // GDC: direct unpckhpd builtin.
        return __builtin_ia32_unpckhpd(a, b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: shufflevector taking the high lane of each operand.
        enum ir = `%r = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 1, i32 3>
                   ret <2 x double> %r`;
        return LDCInlineIR!(ir, double2, double2, double2)(a, b);
    }
    else
    {
        // Scalar fallback: r = [a1, b1].
        double2 r = void;
        r.ptr[0] = a.array[1];
        r.ptr[1] = b.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 6.0);
    __m128d B = _mm_setr_pd(7.0, 9.0);
    __m128d C = _mm_unpackhi_pd(A, B);
    double[2] correct = [6.0, 9.0];
    assert(C.array == correct);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // DMD: native D_SIMD punpcklwd.
        return cast(__m128i) __simd(XMM.PUNPCKLWD, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        // GDC: direct punpcklwd builtin.
        return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: shufflevector selecting the lower 4 lanes of each operand, interleaved.
        enum ir = `%r = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
                   ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
    {
        // x86 inline asm fallback; result is written back into `a`.
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Scalar fallback: r = [a0, b0, a1, b1, a2, b2, a3, b3].
        short8 r = void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sb.array[0];
        r.ptr[2] = sa.array[1];
        r.ptr[3] = sb.array[1];
        r.ptr[4] = sa.array[2];
        r.ptr[5] = sb.array[2];
        r.ptr[6] = sa.array[3];
        r.ptr[7] = sb.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 C = cast(short8) _mm_unpacklo_epi16(A, B);
    short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11];
    assert(C.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // DMD: native D_SIMD punpckldq.
        return cast(__m128i) __simd(XMM.PUNPCKLDQ, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        // GDC: direct punpckldq builtin.
        return __builtin_ia32_punpckldq128(a, b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: shufflevector selecting the lower 2 lanes of each operand, interleaved.
        enum ir = `%r = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
                   ret <4 x i32> %r`;
        return LDCInlineIR!(ir, int4, int4, int4)(cast(int4)a, cast(int4)b);
    }
    else
    {
        // Scalar fallback: r = [a0, b0, a1, b1].
        __m128i r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    __m128i C = _mm_unpacklo_epi32(A, B);
    int[4] correct = [1, 5, 2, 6];
    assert(C.array == correct);
}

/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // GDC: direct punpcklqdq builtin.
        return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
    }
    else
    {
        // Scalar fallback: R = [low64(a), low64(b)].
        long2 lA = cast(long2)a;
        long2 lB = cast(long2)b;
        long2 R; // PERF =void;
        R.ptr[0] = lA.array[0];
        R.ptr[1] = lB.array[0];
        return cast(__m128i)R;
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
    long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
    assert(C.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // DMD: native D_SIMD punpcklbw.
        return cast(__m128i) __simd(XMM.PUNPCKLBW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        // GDC: direct punpcklbw builtin.
        return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: shufflevector selecting the lower 8 lanes of each operand, interleaved.
        enum ir = `%r = shufflevector <16 x i8> %0, <16 x i8> %1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
                   ret <16 x i8> %r`;
        return cast(__m128i)LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else static if (DMD_with_32bit_asm || LDC_with_x86_asm)
    {
        // x86 inline asm fallback; result is written back into `a`.
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Scalar fallback: r = [a0, b0, a1, b1, ..., a7, b7].
        byte16 r = void;
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        r.ptr[0] = ba.array[0];
        r.ptr[1] = bb.array[0];
        r.ptr[2] = ba.array[1];
        r.ptr[3] = bb.array[1];
        r.ptr[4] = ba.array[2];
        r.ptr[5] = bb.array[2];
        r.ptr[6] = ba.array[3];
        r.ptr[7] = bb.array[3];
        r.ptr[8] = ba.array[4];
        r.ptr[9] = bb.array[4];
        r.ptr[10] = ba.array[5];
        r.ptr[11] = bb.array[5];
        r.ptr[12] = ba.array[6];
        r.ptr[13] = bb.array[6];
        r.ptr[14] = ba.array[7];
        r.ptr[15] = bb.array[7];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B);
    byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
    assert(C.array == correct);
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        // GDC: direct unpcklpd builtin.
        return __builtin_ia32_unpcklpd(a, b);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC: shufflevector taking the low lane of each operand.
        enum ir = `%r = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 0, i32 2>
                   ret <2 x double> %r`;
        return LDCInlineIR!(ir, double2, double2, double2)(a, b);
    }
    else
    {
        // Scalar fallback: r = [a0, b0].
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 6.0);
    __m128d B = _mm_setr_pd(7.0, 9.0);
    __m128d C = _mm_unpacklo_pd(A, B);
    double[2] correct = [4.0, 7.0];
    assert(C.array == correct);
}

/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    // Reinterpret both operands as integer vectors, XOR the raw bits,
    // then reinterpret back as doubles.
    __m128i bitsA = cast(__m128i) a;
    __m128i bitsB = cast(__m128i) b;
    return cast(__m128d)(bitsA ^ bitsB);
}
unittest
{
    __m128d X = _mm_setr_pd(-4.0, 6.0);
    __m128d Y = _mm_setr_pd(4.0, -6.0);
    // XOR of a value with its negation leaves only the sign bit set.
    long2 R = cast(long2) _mm_xor_pd(X, Y);
    long[2] expected = [long.min, long.min];
    assert(R.array == expected);
}

/// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    // Lane-wise XOR on int4 is exactly the full 128-bit XOR.
    __m128i result = a ^ b;
    return result;
}
unittest
{
    __m128i X = _mm_setr_epi64(975394, 619809709);
    __m128i Y = _mm_setr_epi64(-920275025, -6);
    long2 R = cast(long2) _mm_xor_si128(X, Y);
    long[2] expected = [975394 ^ (-920275025L), 619809709L ^ -6];
    assert(R.array == expected);
}

unittest
{
    // Smoke test combining several SSE intrinsics: Euclidean distance
    // between two 4D points via horizontal reduction of squared differences.
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 sq = _mm_sub_ps(va, vb);
        sq = _mm_mul_ps(sq, sq);
        __m128 acc = _mm_add_ps(sq, _mm_srli_ps!8(sq));
        acc = _mm_add_ps(acc, _mm_srli_ps!4(acc));
        return _mm_cvtss_f32(_mm_sqrt_ss(acc));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}