/**
* Copyright: Copyright Auburn Sounds 2016-2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.xmmintrin;

public import inteli.types;

import inteli.internals;

import inteli.mmx;
import inteli.emmintrin;

import core.stdc.stdlib: malloc, free;
import core.exception: onOutOfMemoryError;

version(D_InlineAsm_X86)
    version = InlineX86Asm;
else version(D_InlineAsm_X86_64)
    version = InlineX86Asm;


// SSE1

nothrow @nogc:


// MXCSR exception-state bits (sticky flags, read with _MM_GET_EXCEPTION_STATE).
enum int _MM_EXCEPT_INVALID    = 0x0001;
enum int _MM_EXCEPT_DENORM     = 0x0002;
enum int _MM_EXCEPT_DIV_ZERO   = 0x0004;
enum int _MM_EXCEPT_OVERFLOW   = 0x0008;
enum int _MM_EXCEPT_UNDERFLOW  = 0x0010;
enum int _MM_EXCEPT_INEXACT    = 0x0020;
enum int _MM_EXCEPT_MASK       = 0x003f;

// MXCSR exception-mask bits (a set bit suppresses the corresponding exception).
enum int _MM_MASK_INVALID      = 0x0080;
enum int _MM_MASK_DENORM       = 0x0100;
enum int _MM_MASK_DIV_ZERO     = 0x0200;
enum int _MM_MASK_OVERFLOW     = 0x0400;
enum int _MM_MASK_UNDERFLOW    = 0x0800;
enum int _MM_MASK_INEXACT      = 0x1000;
enum int _MM_MASK_MASK         = 0x1f80;

// MXCSR rounding-control field values (bits 13-14).
enum int _MM_ROUND_NEAREST     = 0x0000;
enum int _MM_ROUND_DOWN        = 0x2000;
enum int _MM_ROUND_UP          = 0x4000;
enum int _MM_ROUND_TOWARD_ZERO = 0x6000;
enum int _MM_ROUND_MASK        = 0x6000;

// MXCSR flush-to-zero control (bit 15).
enum int _MM_FLUSH_ZERO_MASK   = 0x8000;
enum int _MM_FLUSH_ZERO_ON     = 0x8000;
enum int _MM_FLUSH_ZERO_OFF    = 0x0000;

/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    return a + b;
}

unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ps(a, a);
    assert(a.array[0] == 2);
    assert(a.array[1] == 4);
    assert(a.array[2] == 6);
    assert(a.array[3] == 8);
}

/// Add the lower single-precision element in `a` and `b`, pass the upper 3 elements of `a` through.
__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
        return __builtin_ia32_addss(a, b);
    else
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ss(a, a);
    assert(a.array == [2.0f, 2, 3, 4]);
}

/// Bitwise AND of the 128 bits in `a` and `b`.
__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
}
unittest
{
    // Note: tested in emmintrin.d
}

/// Bitwise AND-NOT: `(~a) & b` over the 128 bits.
__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
}

/// Compare packed floats for equality (ordered); each lane becomes all-ones on true, 0 on false.
__m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
}

/// Same comparison on the lowest lane only; upper 3 lanes of `a` pass through.
__m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
}

/// Compare packed floats for greater-or-equal (ordered: NaN compares false).
__m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.oge)(a, b);
}
unittest
{
    __m128i R = cast(__m128i) _mm_cmpge_ps(_mm_setr_ps(0, 1, -1, float.nan),
                                           _mm_setr_ps(0, 0, 0, 0));
    int[4] correct = [-1, -1, 0, 0];
    assert(R.array == correct);
}

__m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.oge)(a, b);
}

/// Compare packed floats for greater-than (ordered).
__m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
}

__m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
}

/// Compare packed floats for less-or-equal (ordered).
__m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ole)(a, b);
}

__m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ole)(a, b);
}

/// Compare packed floats for less-than (ordered).
__m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.olt)(a, b);
}

__m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.olt)(a, b);
}

/// Compare packed floats for not-equal (unordered: NaN compares true).
__m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.une)(a, b);
}

__m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.une)(a, b);
}

/// Compare packed floats for not-greater-or-equal (i.e. unordered less-than).
__m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ult)(a, b);
}

__m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ult)(a, b);
}

/// Compare packed floats for not-greater-than (i.e. unordered less-or-equal).
__m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ule)(a, b);
}

__m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ule)(a, b);
}

/// Compare packed floats for not-less-or-equal (i.e. unordered greater-than).
__m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ugt)(a, b);
}

__m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
}

/// Compare packed floats for not-less-than (i.e. unordered greater-or-equal).
__m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.uge)(a, b);
}

__m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.uge)(a, b);
}

/// Per-lane "ordered" test: true when neither operand is NaN.
__m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ord)(a, b);
}

__m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ord)(a, b);
}

/// Per-lane "unordered" test: true when either operand is NaN.
__m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.uno)(a, b);
}

__m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.uno)(a, b);
}

// Note: we've reverted clang and GCC behaviour with regards to EFLAGS
// Some such comparisons yield true for NaNs, others don't.

int _mm_comieq_ss (__m128 a, __m128 b) pure @safe // comiss + sete
{
    return comss!(FPComparison.ueq)(a, b); // yields true for NaN!
}

int _mm_comige_ss (__m128 a, __m128 b) pure @safe // comiss + setae
{
    return comss!(FPComparison.oge)(a, b);
}

int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
{
    return comss!(FPComparison.ogt)(a, b);
}

int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
{
    return comss!(FPComparison.ule)(a, b); // yields true for NaN!
}

int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
{
    return comss!(FPComparison.ult)(a, b); // yields true for NaN!
}

int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
{
    return comss!(FPComparison.one)(a, b);
}

alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;

/// Convert packed 32-bit integers resulting from rounding `a`'s lower two floats.
__m64 _mm_cvt_ps2pi (__m128 a) pure @safe
{
    return to_m64(_mm_cvtps_epi32(a));
}

/// Replace the lowest lane of `v` with `x` converted to float.
__m128 _mm_cvt_si2ss(__m128 v, int x) pure @trusted
{
    v.ptr[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42f, 0, 0, 0]);
}

// Note: is just another name for _mm_cvtss_si32
alias _mm_cvt_ss2si = _mm_cvtss_si32;


/// Convert packed signed 16-bit integers in `a` to packed floats.
__m128 _mm_cvtpi16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpi16_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert the two 32-bit integers in `b` to floats, store them in the lower
/// two lanes of `a`, pass the upper two lanes of `a` through.
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted
{
    __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
    a.ptr[0] = fb.array[0];
    a.ptr[1] = fb.array[1];
    return a;
}
unittest
{
    __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
    float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert the low 32-bit integers of `a` and `b` to two packed floats.
/// NOTE(review): only `array[0]` of each operand is used — presumably intended to
/// match Intel's semantics of converting 2+2 packed ints; confirm against intent.
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted
{
    long2 l;
    l.ptr[0] = a.array[0];
    l.ptr[1] = b.array[0];
    return _mm_cvtepi32_ps(cast(__m128i)l);
}

/// Convert the lower packed signed 8-bit integers in `a` to packed floats.
__m128 _mm_cvtpi8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());

    // Replicate sign bit
    b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24); // Replicate sign bit
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpi8_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert packed floats in `a` to packed signed 16-bit integers, with saturation.
__m64 _mm_cvtps_pi16 (__m128 a) pure @safe
{
    // The C++ version of this intrinsic convert to 32-bit float, then use packssdw
    // Which means the 16-bit integers should be saturated
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, b);
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
    short4 R = cast(short4) _mm_cvtps_pi16(A);
    short[4] correct = [-1, 2, -32768, 32767];
    assert(R.array == correct);
}

/// Convert the lower two floats in `a` to packed 32-bit integers (rounded).
__m64 _mm_cvtps_pi32 (__m128 a) pure @safe
{
    return to_m64(_mm_cvtps_epi32(a));
}
unittest
{
    __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f, );
    int2 R = cast(int2) _mm_cvtps_pi32(A);
    int[2] correct = [-33000, 70000];
    assert(R.array == correct);
}

/// Convert packed floats in `a` to packed signed 8-bit integers, with saturation;
/// upper 4 bytes of the result are zero.
__m64 _mm_cvtps_pi8 (__m128 a) pure @safe
{
    // The C++ version of this intrinsic convert to 32-bit float, then use packssdw + packsswb
    // Which means the 8-bit integers should be saturated
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, _mm_setzero_si128());
    b = _mm_packs_epi16(b, _mm_setzero_si128());
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
    byte8 R = cast(byte8) _mm_cvtps_pi8(A);
    byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Convert packed unsigned 16-bit integers in `a` to packed floats.
__m128 _mm_cvtpu16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpu16_ps(A);
    float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert the lower packed unsigned 8-bit integers in `a` to packed floats.
__m128 _mm_cvtpu8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpu8_ps(A);
    float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f];
    assert(R.array == correct);
}

/// Replace the lowest lane of `v` with `x` converted to float.
__m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted
{
    v.ptr[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

// Note: on macOS, using "llvm.x86.sse.cvtsi642ss" was buggy
/// Replace the lowest lane of `v` with the 64-bit integer `x` converted to float.
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted
{
    v.ptr[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

/// Extract the lowest float lane of `a`.
float _mm_cvtss_f32(__m128 a) pure @safe
{
    return a.array[0];
}

version(LDC)
{
    alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
}
else
{
    /// Convert the lowest lane of `a` to `int`, honouring the current MXCSR rounding mode.
    int _mm_cvtss_si32 (__m128 a) pure @safe
    {
        return convertFloatToInt32UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
}

version(LDC)
{
    version(X86_64)
        alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
    else
    {
        // Note: __builtin_ia32_cvtss2si64 crashes LDC in 32-bit
        long _mm_cvtss_si64 (__m128 a) pure @safe
        {
            return convertFloatToInt64UsingMXCSR(a.array[0]);
        }
    }
}
else
{
    /// Convert the lowest lane of `a` to `long`, honouring the current MXCSR rounding mode.
    long _mm_cvtss_si64 (__m128 a) pure @safe
    {
        return convertFloatToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.5f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}


version(LDC)
{
    alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;
}
else
{
    /// Convert the lowest lane of `a` to `int` with truncation (round toward zero).
    int _mm_cvtt_ss2si (__m128 a) pure @safe
    {
        return cast(int)(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert the lower two floats of `a` to packed 32-bit integers with truncation.
__m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
{
    return to_m64(_mm_cvttps_epi32(a));
}

alias _mm_cvttss_si32 = _mm_cvtt_ss2si; // it's actually the same op

// Note: __builtin_ia32_cvttss2si64 crashes LDC when generating 32-bit x86 code.
/// Convert the lowest lane of `a` to `long` with truncation (round toward zero).
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Divide packed single-precision floats in `a` by those in `b`.
__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    return a / b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ps(a, a);
    float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}

/// Divide the lowest lane of `a` by that of `b`, pass the upper 3 lanes of `a` through.
__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
        return __builtin_ia32_divss(a, b);
    else
    {
        a[0] /= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ss(a, a);
    float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}

/// Extract the 16-bit lane of `a` selected by `imm8`, zero-extended to `int`.
int _mm_extract_pi16 (__m64 a, int imm8)
{
    short4 sa = cast(short4)a;
    return cast(ushort)(sa.array[imm8]);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 6, 0, 4);
    assert(_mm_extract_pi16(A, 0) == 65535);
    assert(_mm_extract_pi16(A, 1) == 6);
    assert(_mm_extract_pi16(A, 2) == 0);
    assert(_mm_extract_pi16(A, 3) == 4);
}

/// Free aligned memory that was allocated with `_mm_malloc`.
void _mm_free(void * mem_addr) @trusted
{
    // support for free(NULL)
    if (mem_addr is null)
        return;

    // Technically we don't need to store size and alignment in the chunk, but we do in case we
    // have to implement _mm_realloc

    size_t pointerSize = (void*).sizeof;
    // The words stored just before `mem_addr` were written by _mm_malloc:
    // raw pointer at mem_addr - pointerSize, alignment at mem_addr - 3*pointerSize.
    void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof);
    size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
    size_t alignment = *alignmentLocation;
    assert(alignment != 0);
    assert(isPointerAligned(mem_addr, alignment));
    free(*rawLocation);
}

/// Get the exception-mask bits from MXCSR.
uint _MM_GET_EXCEPTION_MASK() pure @safe
{
    return _mm_getcsr() & _MM_MASK_MASK;
}

/// Get the sticky exception-state bits from MXCSR.
uint _MM_GET_EXCEPTION_STATE() pure @safe
{
    return _mm_getcsr() & _MM_EXCEPT_MASK;
}

/// Get the flush-to-zero bit from MXCSR.
uint _MM_GET_FLUSH_ZERO_MODE() pure @safe
{
    return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/// Get the rounding-control bits from MXCSR.
uint _MM_GET_ROUNDING_MODE() pure @safe
{
    return _mm_getcsr() & _MM_ROUND_MASK;
}

/// Read the MXCSR control/status register.
uint _mm_getcsr() pure @safe
{
    version(GNU)
    {
        static if (GDC_with_SSE)
        {
            return __builtin_ia32_stmxcsr();
        }
        else version(X86)
        {
            uint sseRounding = 0;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %0;\n"
                  : "=m" (sseRounding)
                  :
                  : ;
            }
            return sseRounding;
        }
        else
            static assert(false);
    }
    else version (InlineX86Asm)
    {
        uint controlWord;
        asm nothrow @nogc pure @safe
        {
            stmxcsr controlWord;
        }
        return controlWord;
    }
    else
        static assert(0, "Not yet supported");
}
unittest
{
    uint csr = _mm_getcsr();
}

/// Copy `v`, replacing the 16-bit lane selected by `index` (masked to 0..3) with `i`.
__m64 _mm_insert_pi16 (__m64 v, int i, int index) pure @trusted
{
    short4 r = cast(short4)v;
    r.ptr[index & 3] = cast(short)i;
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_set_pi16(3, 2, 1, 0);
    short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4);
    short[4] correct = [0, 42, 2, 3];
    assert(R.array == correct);
}

/// Load 4 floats from 16-byte-aligned memory.
__m128 _mm_load_ps(const(float)*p) pure @trusted
{
    return *cast(__m128*)p;
}

/// Load one float and broadcast it to all 4 lanes.
__m128 _mm_load_ps1(const(float)*p) pure @trusted
{
    return __m128(*p);
}

/// Load one float into the lowest lane, zero the upper 3 lanes.
__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    __m128 r;
    r.ptr[0] = *mem_addr;
    r.ptr[1] = 0;
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return r;
}

alias _mm_load1_ps = _mm_load_ps1;

/// Load 2 floats from memory into the upper 2 lanes; lower 2 lanes come from `a`.
__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
{
    long2 la = cast(long2)a;
    la.ptr[1] = (*mem_addr).array[0];
    return cast(__m128)la;
}

/// Load 2 floats from memory into the lower 2 lanes; upper 2 lanes come from `a`.
__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
{
    long2 la = cast(long2)a;
    la.ptr[0] = (*mem_addr).array[0];
    return cast(__m128)la;
}

/// Load 4 floats from 16-byte-aligned memory in reverse order.
__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 a = *aligned;
    __m128 r;
    r.ptr[0] = a.array[3];
    r.ptr[1] = a.array[2];
    r.ptr[2] = a.array[1];
    r.ptr[3] = a.array[0];
    return r;
}

/// Load 4 floats from unaligned memory.
__m128 _mm_loadu_ps(const(float)*p) pure @safe
{
    return loadUnaligned!(__m128)(p);
}

/// Load one 16-bit value from unaligned memory into the lowest lane, zero the rest.
__m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted
{
    short r = *cast(short*)(mem_addr);
    short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
    result.ptr[0] = r;
    return cast(__m128i)result;
}
unittest
{
    short r = 13;
    short8 A = cast(short8) _mm_loadu_si16(&r);
    short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
    assert(A.array == correct);
}

/// Load one 64-bit value from unaligned memory into the lower lane, zero the upper lane.
__m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted
{
    long r = *cast(long*)(mem_addr);
    long2 result = [0, 0];
    result.ptr[0] = r;
    return cast(__m128i)result;
}
unittest
{
    long r = 446446446446;
    long2 A = cast(long2) _mm_loadu_si64(&r);
    long[2] correct = [446446446446, 0];
    assert(A.array == correct);
}

/// Allocate size bytes of memory, aligned to the alignment specified in align,
/// and return a pointer to the allocated memory. `_mm_free` should be used to free
/// memory that is allocated with `_mm_malloc`.
void* _mm_malloc(size_t size, size_t alignment) @trusted
{
    assert(alignment != 0);
    size_t request = requestedSize(size, alignment);
    void* raw = malloc(request);
    if (request > 0 && raw == null) // malloc(0) can validly return anything
        onOutOfMemoryError();
    return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
}

/// Conditionally store 8 bytes of `a` to `mem_addr` according to the sign bits of `mask`.
void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted
{
    // this works since mask is zero-extended
    return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr);
}

deprecated alias _m_maskmovq = _mm_maskmove_si64;

/// Compare packed signed 16-bit integers in `a` and `b`, return packed maxima.
__m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b)));
}

static if (GDC_with_SSE)
{
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else version(LDC)
{
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else
{
    /// Per-lane maximum of packed floats. Like the `maxps` instruction, when a
    /// lane compares false (including NaN), the `b` operand's lane is taken.
    __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
    {
        __m128 r;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        r[1] = (a[1] > b[1]) ? a[1] : b[1];
        r[2] = (a[2] > b[2]) ? a[2] : b[2];
        r[3] = (a[3] > b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_max_ps(A, B);
    assert(M.array[0] == 4);
    assert(M.array[1] == 2);
    assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
    assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, return packed maxima.
__m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b)));
}

static if (GDC_with_SSE)
{
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else version(LDC)
{
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else
{
    /// Maximum of the lowest lanes of `a` and `b`; upper 3 lanes of `a` pass through.
    __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
    {
        __m128 r = a;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_max_ss(A, B);
    assert(M.array[0] == 4);
    assert(M.array[1] == 2);
    assert(M.array[2] == 3);
    assert(M.array[3] == 4);
    M = _mm_max_ps(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_max_ps(C, A); // in case of NaN, second operand prevails
    assert(M.array[0] == 1);
}

/// Compare packed signed 16-bit integers in `a` and `b`, return packed minima.
__m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b)));
}

static if (GDC_with_SSE)
{
    alias _mm_min_ps = __builtin_ia32_minps;
}
else version(LDC)
{
    alias _mm_min_ps = __builtin_ia32_minps;
}
else
{
    /// Per-lane minimum of packed floats. Like the `minps` instruction, when a
    /// lane compares false (including NaN), the `b` operand's lane is taken.
    __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
    {
        __m128 r;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        r[1] = (a[1] < b[1]) ? a[1] : b[1];
        r[2] = (a[2] < b[2]) ? a[2] : b[2];
        r[3] = (a[3] < b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_min_ps(A, B);
    assert(M.array[0] == 1);
    assert(M.array[1] == 1);
    assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
    assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, return packed minima.
__m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b)));
}

static if (GDC_with_SSE)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}
else version(LDC)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}
else
{
    /// Minimum of the lowest lanes of `a` and `b`; upper 3 lanes of `a` pass through.
    __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
    {
        __m128 r = a;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_min_ss(A, B);
    assert(M.array[0] == 1);
    assert(M.array[1] == 2);
    assert(M.array[2] == 3);
    assert(M.array[3] == 4);
    M = _mm_min_ps(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_min_ps(C, A); // in case of NaN, second operand prevails
    assert(M.array[0] == 1);
}

/// Move the lowest lane of `b` into the lowest lane of `a`.
__m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}

/// Result: upper 2 lanes of `a` in the lower half, upper 2 lanes of `b` in the upper half.
__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted
{
    b.ptr[0] = a.array[2];
    b.ptr[1] = a.array[3];
    return b;
}

/// Result: lower 2 lanes of `a` in the lower half, lower 2 lanes of `b` in the upper half.
__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted
{
    a.ptr[2] = b.array[0];
    a.ptr[3] = b.array[1];
    return a;
}

/// Build an 8-bit mask from the most significant bit of each byte of `a`.
int _mm_movemask_pi8 (__m64 a) pure @safe
{
    return _mm_movemask_epi8(to_m128i(a));
}
unittest
{
    assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0)));
}

static if (GDC_with_SSE)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}
else version(LDC)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}
else
{
    /// Build a 4-bit mask from the sign bit of each float lane of `a`.
    int _mm_movemask_ps (__m128 a) pure @safe
    {
        int4 ai = cast(int4)a;
        int r = 0;
        if (ai[0] < 0) r += 1;
        if (ai[1] < 0) r += 2;
        if (ai[2] < 0) r += 4;
        if (ai[3] < 0) r += 8;
        return r;
    }
}
unittest
{
    int4 A = [-1, 0, -43, 0];
    assert(5 == _mm_movemask_ps(cast(float4)A));
}

/// Multiply packed single-precision floats in `a` and `b`.
__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    return a * b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ps(a, a);
    float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(a.array == correct);
}

/// Multiply the lowest lanes of `a` and `b`; upper 3 lanes of `a` pass through.
__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
        return __builtin_ia32_mulss(a, b);
    else
    {
        a[0] *= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ss(a, a);
    float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(a.array == correct);
}

/// Multiply packed unsigned 16-bit integers, return the high 16 bits of each product.
__m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(0, -16, 2, 3);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pu16(A, B);
    short[4] correct = [0, 0x3FFC, 0, 0];
    assert(R.array == correct);
}

/// Bitwise OR of the 128 bits in `a` and `b`.
__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
}

// Legacy MMX-era mnemonics for the __m64 intrinsics above.
deprecated alias
    _m_pavgb = _mm_avg_pu8,
    _m_pavgw = _mm_avg_pu16,
    _m_pextrw = _mm_extract_pi16,
    _m_pinsrw = _mm_insert_pi16,
    _m_pmaxsw = _mm_max_pi16,
    _m_pmaxub = _mm_max_pu8,
    _m_pminsw = _mm_min_pi16,
    _m_pminub = _mm_min_pu8,
    _m_pmovmskb = _mm_movemask_pi8,
    _m_pmulhuw = _mm_mulhi_pu16;

enum _MM_HINT_T0  = 3; ///
enum _MM_HINT_T1  = 2; ///
enum _MM_HINT_T2  = 1; ///
enum _MM_HINT_NTA = 0; ///


version(LDC)
{
    // Starting with LLVM 10, it seems llvm.prefetch has changed its name.
    // Was reported at: https://github.com/ldc-developers/ldc/issues/3397
    static if (__VERSION__ >= 2091)
    {
        pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch"
        void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe;
    }
}

/// Fetch the line of data from memory that contains address `p` to a location in the
/// cache hierarchy specified by the locality hint i.
///
/// Warning: `locality` is a compile-time parameter, unlike in Intel Intrinsics API.
void _mm_prefetch(int locality)(const(void)* p) pure @trusted
{
    static if (GDC_with_SSE)
    {
        return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2091)
        {
            // const_cast here. `llvm_prefetch` wants a mutable pointer
            llvm_prefetch_fixed( cast(void*)p, 0, locality, 1);
        }
        else
        {
            // const_cast here. `llvm_prefetch` wants a mutable pointer
            llvm_prefetch( cast(void*)p, 0, locality, 1);
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        static if (locality == _MM_HINT_NTA)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetchnta [RAX];
            }
        }
        else static if (locality == _MM_HINT_T0)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetcht0 [RAX];
            }
        }
        else static if (locality == _MM_HINT_T1)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetcht1 [RAX];
            }
        }
        else static if (locality == _MM_HINT_T2)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetcht2 [RAX];
            }
        }
        else
            assert(false); // invalid locality hint
    }
    else version(D_InlineAsm_X86)
    {
        static if (locality == _MM_HINT_NTA)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetchnta [EAX];
            }
        }
        else static if (locality == _MM_HINT_T0)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetcht0 [EAX];
            }
        }
        else static if (locality == _MM_HINT_T1)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetcht1 [EAX];
            }
        }
        else static if (locality == _MM_HINT_T2)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetcht2 [EAX];
            }
        }
        else
            assert(false); // invalid locality hint
    }
    else
    {
        // Generic version: do nothing. From bitter experience,
        // it's unlikely you get ANY speed-up with manual prefetching.
        // Prefetching or not doesn't change program behaviour.
    }
}
unittest
{
    // From Intel documentation:
    // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of 32 bytes."
    ubyte[256] cacheline; // though it seems it cannot generate GP fault
    _mm_prefetch!_MM_HINT_T0(cacheline.ptr);
    _mm_prefetch!_MM_HINT_T1(cacheline.ptr);
    _mm_prefetch!_MM_HINT_T2(cacheline.ptr);
    _mm_prefetch!_MM_HINT_NTA(cacheline.ptr);
}

deprecated alias
    _m_psadbw = _mm_sad_pu8,
    _m_pshufw = _mm_shuffle_pi16;

static if (GDC_with_SSE)
{
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
else version(LDC)
{
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
else
{
    /// Approximate reciprocal of packed floats (fallback: exact division).
    __m128 _mm_rcp_ps (__m128 a) pure @safe
    {
        a[0] = 1.0f / a[0];
        a[1] = 1.0f / a[1];
        a[2] = 1.0f / a[2];
        a[3] = 1.0f / a[3];
        return a;
    }
}

static if (GDC_with_SSE)
{
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
else version(LDC)
{
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
else
{
    /// Approximate reciprocal of the lowest lane; upper 3 lanes pass through.
    __m128 _mm_rcp_ss (__m128 a) pure @safe
    {
        a[0] = 1.0f / a[0];
        return a;
    }
}

static if (GDC_with_SSE)
{
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
else version(LDC)
{
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
else
{
    /// Approximate reciprocal square root of packed floats (fallback: exact computation).
    __m128 _mm_rsqrt_ps (__m128 a) pure @safe
    {
        a[0] = 1.0f / sqrt(a[0]);
        a[1] = 1.0f / sqrt(a[1]);
        a[2] = 1.0f / sqrt(a[2]);
        a[3] = 1.0f / sqrt(a[3]);
        return a;
    }
}

static if (GDC_with_SSE)
{
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
else version(LDC)
{
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
else
{
    /// Approximate reciprocal square root of the lowest lane; upper 3 lanes pass through.
    __m128 _mm_rsqrt_ss (__m128 a) pure @safe
    {
        a[0] = 1.0f / sqrt(a[0]);
        return a;
    }
}

unittest
{
    // The hardware approximations are only accurate to ~12 bits; accept -72 dB error.
    double maxRelativeError = 0.000245; // -72 dB
    void testInvSqrt(float number) nothrow @nogc
    {
        __m128 A = _mm_set1_ps(number);

        // test _mm_rcp_ps
        __m128 B = _mm_rcp_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / A.array[i];
            double ratio = cast(double)(B.array[i]) / cast(double)(exact);
            assert(abs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rcp_ss
        {
            B = _mm_rcp_ss(A);
            double exact = 1.0f / A.array[0];
            double ratio = cast(double)(B.array[0]) / cast(double)(exact);
            assert(abs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ps
        B = _mm_rsqrt_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / sqrt(A.array[i]);
            double ratio = cast(double)(B.array[i]) / cast(double)(exact);
            assert(abs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ss
        {
            B = _mm_rsqrt_ss(A);
            double exact = 1.0f / sqrt(A.array[0]);
            double ratio = cast(double)(B.array[0]) / cast(double)(exact);
            assert(abs(ratio - 1) <= maxRelativeError);
        }
    }

    testInvSqrt(1.1f);
    testInvSqrt(2.45674864151f);
    testInvSqrt(27841456468.0f);
}

/// Sum of absolute differences of packed unsigned 8-bit integers.
__m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b)));
}

/// Set the exception-mask bits of MXCSR.
void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
}

/// Set the sticky exception-state bits of MXCSR.
void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
}

/// Set the flush-to-zero bit of MXCSR.
void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
}

/// Set packed floats with `e0` in the lowest lane (Intel argument order: high to low).
__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    // Note: despite appearances, generates sensible code,
    // inlines correctly and is constant folded
    float[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(float4)(result.ptr);
}
unittest
{
    __m128 A = _mm_set_ps(3, 2, 1, 546);
    float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f];
    assert(A.array == correct);
    assert(A.array[0] == 546.0f);
    assert(A.array[1] == 1.0f);
    assert(A.array[2] == 2.0f);
    assert(A.array[3] == 3.0f);
}

alias _mm_set_ps1 = _mm_set1_ps;

/// Set the rounding-control bits of MXCSR.
void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
}

/// Set the lowest lane to `a`, zero the upper 3 lanes.
__m128 _mm_set_ss (float a) pure @trusted
{
    __m128 r = _mm_setzero_ps();
    r.ptr[0] = a;
    return r;
}
unittest
{
    float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
    __m128 A = _mm_set_ss(42.0f);
    assert(A.array == correct);
}

/// Broadcast `a` to all 4 lanes.
__m128 _mm_set1_ps (float a) pure @trusted
{
    __m128 r = void;
    r.ptr[0] = a;
    r.ptr[1] = a;
    r.ptr[2] = a;
    r.ptr[3] = a;
    return r;
}
unittest
{
    float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f];
    __m128 A = _mm_set1_ps(42.0f);
    assert(A.array == correct);
}


/// Write the MXCSR control/status register.
void _mm_setcsr(uint controlWord) pure @safe
{
    version(GNU)
    {
        static if (GDC_with_SSE)
        {
            __builtin_ia32_ldmxcsr(controlWord);
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "ldmxcsr %0;\n"
                  :
                  : "m" (controlWord)
                  : ;
            }
        }
    }
    else version (InlineX86Asm)
    {
        asm pure nothrow @nogc @safe
        {
            ldmxcsr controlWord;
        }
    }
    else
        static assert(0, "Not yet supported");
}
unittest
{
    _mm_setcsr(_mm_getcsr());
}

/// Set packed floats with `e3` in the lowest lane (reversed, memory order).
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] result = [e3, e2, e1, e0];
    return loadUnaligned!(float4)(result.ptr);
}
unittest
{
    __m128 A = _mm_setr_ps(3, 2, 1, 546);
    float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f];
    assert(A.array == correct);
    assert(A.array[0] == 3.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 1.0f);
    assert(A.array[3] == 546.0f);
}

/// Return a vector of all zeroes.
__m128 _mm_setzero_ps() pure @trusted
{
    // Compiles to xorps without problems
    float[4] result = [0.0f, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(result.ptr);
}

version(GNU)
{
    /// Perform a store fence, serializing store operations.
    void _mm_sfence() pure @trusted
    {
        static if (GDC_with_SSE)
        {
            __builtin_ia32_sfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "sfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else version(LDC)
{
    alias _mm_sfence = __builtin_ia32_sfence;
}
else static if (DMD_with_asm)
{
    /// Perform a store fence, serializing store operations.
    void _mm_sfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            sfence;
        }
    }
}
else
    static assert(false);
unittest
{
    _mm_sfence();
}

/// Shuffle 16-bit lanes of `a` using the compile-time control `imm8` (see `_MM_SHUFFLE`).
__m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @safe
{
    return cast(__m64) shufflevector!(short4, ( (imm8 >> 0) & 3 ),
                                              ( (imm8 >> 2) & 3 ),
                                              ( (imm8 >> 4) & 3 ),
                                              ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a);
}
unittest
{
    __m64 A = _mm_setr_pi16(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A);
    short[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

// Note: the immediate shuffle value is given at compile-time instead of runtime.
/// Shuffle single-precision elements using the compile-time control `imm`:
/// the two low result lanes come from `a`, the two high lanes from `b`,
/// each selected by a 2-bit field of `imm` (SHUFPS semantics).
__m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
}

/// Compute the square root of packed single-precision elements (SQRTPS).
static if (GDC_with_SSE)
{
    alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
}
else version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
    else
    {
        __m128 _mm_sqrt_ps(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            vec.array[2] = llvm_sqrt(vec.array[2]);
            vec.array[3] = llvm_sqrt(vec.array[3]);
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ps(__m128 vec) pure @trusted
    {
        vec.ptr[0] = sqrt(vec.array[0]);
        vec.ptr[1] = sqrt(vec.array[1]);
        vec.ptr[2] = sqrt(vec.array[2]);
        vec.ptr[3] = sqrt(vec.array[3]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 2.0f);
    assert(A.array[3] == 2.0f);
}

/// Compute the square root of the lower single-precision element (SQRTSS);
/// the upper 3 elements are passed through unchanged.
static if (GDC_with_SSE)
{
    alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
}
else version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
    else
    {
        __m128 _mm_sqrt_ss(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            vec.array[2] = vec.array[2];
            vec.array[3] = vec.array[3];
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ss(__m128 vec) pure @trusted
    {
        vec.ptr[0] = sqrt(vec.array[0]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 4.0f);
    assert(A.array[2] == 4.0f);
    assert(A.array[3] == 4.0f);
}

/// Store `a` to 16-byte-aligned memory at `mem_addr`.
void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = a;
}

/// Same as `_mm_store1_ps`.
alias _mm_store_ps1 = _mm_store1_ps;

/// Store the lowest element of `a` to `mem_addr` (no alignment requirement).
void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    *mem_addr = a.array[0];
}
unittest
{
    float a;
    _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546));
    assert(a == 546);
}

/// Broadcast the lowest element of `a` to all four floats at the
/// 16-byte-aligned address `mem_addr`.
void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[0];
    r.ptr[2] = a.array[0];
    r.ptr[3] = a.array[0];
    *aligned = r;
}

/// Store the upper 64 bits (elements 2 and 3) of `a` to `*p`.
void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted
{
    long2 la = cast(long2)a;
    (*p).ptr[0] = la.array[1];
}
unittest
{
    __m64 R = _mm_setzero_si64();
    long2 A = [13, 25];
    _mm_storeh_pi(&R, cast(__m128)A);
    assert(R.array[0] == 25);
}

/// Store the lower 64 bits (elements 0 and 1) of `a` to `*p`.
void _mm_storel_pi(__m64* p, __m128 a) pure @trusted
{
    long2 la = cast(long2)a;
    (*p).ptr[0] = la.array[0];
}
unittest
{
    __m64 R = _mm_setzero_si64();
    long2 A = [13, 25];
    _mm_storel_pi(&R, cast(__m128)A);
    assert(R.array[0] == 13);
}

/// Store the four elements of `a` in reverse order to the 16-byte-aligned
/// address `mem_addr`.
void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 r;
    r.ptr[0] = a.array[3];
    r.ptr[1] = a.array[2];
    r.ptr[2] = a.array[1];
    r.ptr[3] = a.array[0];
    *aligned = r;
}

/// Store `a` to unaligned memory at `mem_addr`.
void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
{
    storeUnaligned!(float4)(a, mem_addr);
}

/// "Non-temporal" 64-bit store. See `_mm_stream_ps`: non-temporal hints are
/// not implemented here, so this is an ordinary store.
void _mm_stream_pi (__m64* mem_addr, __m64 a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a; // it's a regular move instead
}

// BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal
// needs some IR outside this function that would say:
//
//  !0 = !{ i32 1 }
//
// It's a LLVM IR metadata description.
// Regardless, non-temporal moves are really dangerous for performance...
void _mm_stream_ps (float* mem_addr, __m128 a)
{
    __m128* dest = cast(__m128*)mem_addr;
    *dest = a; // it's a regular move instead
}
unittest
{
    align(16) float[4] A;
    _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
    assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
}

/// Subtract packed single-precision elements of `b` from `a`.
__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    return a - b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ps(a, a);
    float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == correct);
}

/// Subtract the lowest element of `b` from the lowest element of `a`;
/// upper 3 elements of `a` pass through unchanged.
__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
        return __builtin_ia32_subss(a, b);
    else
    {
        a[0] -= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ss(a, a);
    float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}


/// Transpose the 4x4 matrix formed by `row0`..`row3` in place, using the
/// classic unpack/move shuffle sequence.
void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    __m128 tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm_unpacklo_ps(row0, row1);
    tmp2 = _mm_unpacklo_ps(row2, row3);
    tmp1 = _mm_unpackhi_ps(row0, row1);
    tmp3 = _mm_unpackhi_ps(row2, row3);
    row0 = _mm_movelh_ps(tmp0, tmp2);
    row1 = _mm_movehl_ps(tmp2, tmp0);
    row2 = _mm_movelh_ps(tmp1, tmp3);
    row3 = _mm_movehl_ps(tmp3, tmp1);
}

// Note: the only difference between these intrinsics is the signalling
// behaviour of quiet NaNs. This is incorrect but the case where
// you would want to differentiate between qNaN and sNaN and then
// treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_ss = _mm_comieq_ss;
alias _mm_ucomige_ss = _mm_comige_ss;
alias _mm_ucomigt_ss = _mm_comigt_ss;
alias _mm_ucomile_ss = _mm_comile_ss;
alias _mm_ucomilt_ss = _mm_comilt_ss;
alias _mm_ucomineq_ss = _mm_comineq_ss;


/// Return a vector with undefined (uninitialized) contents.
__m128 _mm_undefined_ps() pure @safe
{
    __m128 undef = void;
    return undef;
}

/// Interleave the high halves of `a` and `b`:
/// result = [a2, b2, a3, b3].
__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted
{
    __m128 r;
    r.ptr[0] = a.array[2];
    r.ptr[1] = b.array[2];
    r.ptr[2] = a.array[3];
    r.ptr[3] = b.array[3];
    return r;
}

/// Interleave the low halves of `a` and `b`:
/// result = [a0, b0, a1, b1].
__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted
{
    __m128 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = b.array[0];
    r.ptr[2] = a.array[1];
    r.ptr[3] = b.array[1];
    return r;
}

/// Bitwise XOR of `a` and `b`, reinterpreted as integer lanes.
__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
}


// Helpers for the aligned-allocation intrinsics (_mm_malloc/_mm_free).
private
{
    /// Returns: `true` if the pointer is suitably aligned.
    bool isPointerAligned(void* p, size_t alignment) pure
    {
        assert(alignment != 0);
        return ( cast(size_t)p & (alignment - 1) ) == 0;
    }

    /// Returns: next pointer aligned with alignment bytes.
1778 void* nextAlignedPointer(void* start, size_t alignment) pure 1779 { 1780 return cast(void*)nextMultipleOf(cast(size_t)(start), alignment); 1781 } 1782 1783 // Returns number of bytes to actually allocate when asking 1784 // for a particular alignment 1785 @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure 1786 { 1787 enum size_t pointerSize = size_t.sizeof; 1788 return askedSize + alignment - 1 + pointerSize * 3; 1789 } 1790 1791 // Store pointer given my malloc, size and alignment 1792 @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure 1793 { 1794 enum size_t pointerSize = size_t.sizeof; 1795 char* start = cast(char*)raw + pointerSize * 3; 1796 void* aligned = nextAlignedPointer(start, alignment); 1797 void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize); 1798 *rawLocation = raw; 1799 size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize); 1800 *sizeLocation = size; 1801 size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize); 1802 *alignmentLocation = alignment; 1803 assert( isPointerAligned(aligned, alignment) ); 1804 return aligned; 1805 } 1806 1807 // Returns: x, multiple of powerOfTwo, so that x >= n. 
1808 @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow 1809 { 1810 // check power-of-two 1811 assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0)); 1812 1813 size_t mask = ~(powerOfTwo - 1); 1814 return (n + powerOfTwo - 1) & mask; 1815 } 1816 } 1817 1818 unittest 1819 { 1820 assert(nextMultipleOf(0, 4) == 0); 1821 assert(nextMultipleOf(1, 4) == 4); 1822 assert(nextMultipleOf(2, 4) == 4); 1823 assert(nextMultipleOf(3, 4) == 4); 1824 assert(nextMultipleOf(4, 4) == 4); 1825 assert(nextMultipleOf(5, 4) == 8); 1826 1827 { 1828 void* p = _mm_malloc(23, 16); 1829 assert(p !is null); 1830 assert(((cast(size_t)p) & 0xf) == 0); 1831 _mm_free(p); 1832 } 1833 1834 void* nullAlloc = _mm_malloc(0, 32); 1835 assert(nullAlloc != null); 1836 _mm_free(nullAlloc); 1837 }