/**
* Copyright: Copyright Auburn Sounds 2016-2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.xmmintrin;

public import inteli.types;

import inteli.internals;

import inteli.mmx;
import inteli.emmintrin;

import core.stdc.stdlib: malloc, free;
import core.exception: onOutOfMemoryError;

version(D_InlineAsm_X86)
    version = InlineX86Asm;
else version(D_InlineAsm_X86_64)
    version = InlineX86Asm;


// SSE1

nothrow @nogc:


enum int _MM_EXCEPT_INVALID    = 0x0001;
enum int _MM_EXCEPT_DENORM     = 0x0002;
enum int _MM_EXCEPT_DIV_ZERO   = 0x0004;
enum int _MM_EXCEPT_OVERFLOW   = 0x0008;
enum int _MM_EXCEPT_UNDERFLOW  = 0x0010;
enum int _MM_EXCEPT_INEXACT    = 0x0020;
enum int _MM_EXCEPT_MASK       = 0x003f;

enum int _MM_MASK_INVALID      = 0x0080;
enum int _MM_MASK_DENORM       = 0x0100;
enum int _MM_MASK_DIV_ZERO     = 0x0200;
enum int _MM_MASK_OVERFLOW     = 0x0400;
enum int _MM_MASK_UNDERFLOW    = 0x0800;
enum int _MM_MASK_INEXACT      = 0x1000;
enum int _MM_MASK_MASK         = 0x1f80;

enum int _MM_ROUND_NEAREST     = 0x0000;
enum int _MM_ROUND_DOWN        = 0x2000;
enum int _MM_ROUND_UP          = 0x4000;
enum int _MM_ROUND_TOWARD_ZERO = 0x6000;
enum int _MM_ROUND_MASK        = 0x6000;

enum int _MM_FLUSH_ZERO_MASK   = 0x8000;
enum int _MM_FLUSH_ZERO_ON     = 0x8000;
enum int _MM_FLUSH_ZERO_OFF    = 0x0000;

__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    return a + b;
}

unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ps(a, a);
    assert(a.array[0] == 2);
    assert(a.array[1] == 4);
    assert(a.array[2] == 6);
    assert(a.array[3] == 8);
}

__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
        return __builtin_ia32_addss(a, b);
    else
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ss(a, a);
    assert(a.array == [2.0f, 2, 3, 4]);
}

__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
}
unittest
{
    // Note: tested in emmintrin.d
}

__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
}
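
// Minimal sketch of the rounding average ((a + b + 1) >> 1), reusing
// `_mm_setr_pi16` and `short4` like the neighbouring unit tests.
unittest
{
    __m64 A = _mm_setr_pi16(1, 2, 3, 4);
    __m64 B = _mm_setr_pi16(3, 2, 1, 0);
    short4 R = cast(short4) _mm_avg_pu16(A, B);
    short[4] correct = [2, 2, 2, 2];
    assert(R.array == correct);
}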

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
}

__m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
}

__m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
}

__m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.oge)(a, b);
}
unittest
{
    __m128i R = cast(__m128i) _mm_cmpge_ps(_mm_setr_ps(0, 1, -1, float.nan),
                                           _mm_setr_ps(0, 0, 0, 0));
    int[4] correct = [-1, -1, 0, 0];
    assert(R.array == correct);
}

__m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.oge)(a, b);
}

__m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
}

__m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
}

__m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ole)(a, b);
}

__m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ole)(a, b);
}

__m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.olt)(a, b);
}

__m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.olt)(a, b);
}

__m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.une)(a, b);
}

__m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.une)(a, b);
}

__m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ult)(a, b);
}

__m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ult)(a, b);
}

__m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ule)(a, b);
}

__m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ule)(a, b);
}

__m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ugt)(a, b);
}

__m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
}

__m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.uge)(a, b);
}

__m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.uge)(a, b);
}

__m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ord)(a, b);
}

__m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ord)(a, b);
}

__m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.uno)(a, b);
}

__m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.uno)(a, b);
}
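
// Minimal sketch of the unordered comparison: a lane compares true (all-ones)
// whenever either input is NaN, following the style of the `_mm_cmpge_ps` test above.
unittest
{
    __m128i R = cast(__m128i) _mm_cmpunord_ps(_mm_setr_ps(1, float.nan, 3, float.nan),
                                              _mm_setr_ps(float.nan, 2, 3, 4));
    int[4] correct = [-1, -1, 0, -1];
    assert(R.array == correct);
}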

// Note: we've reverted clang and GCC behaviour with regard to EFLAGS.
// Some such comparisons yield true for NaNs, others don't.

int _mm_comieq_ss (__m128 a, __m128 b) pure @safe // comiss + sete
{
    return comss!(FPComparison.ueq)(a, b); // yields true for NaN!
}

int _mm_comige_ss (__m128 a, __m128 b) pure @safe // comiss + setae
{
    return comss!(FPComparison.oge)(a, b);
}

int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
{
    return comss!(FPComparison.ogt)(a, b);
}

int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
{
    return comss!(FPComparison.ule)(a, b); // yields true for NaN!
}

int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
{
    return comss!(FPComparison.ult)(a, b); // yields true for NaN!
}

int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
{
    return comss!(FPComparison.one)(a, b);
}

alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;

__m64 _mm_cvt_ps2pi (__m128 a) pure @safe
{
    return to_m64(_mm_cvtps_epi32(a));
}

__m128 _mm_cvt_si2ss(__m128 v, int x) pure @trusted
{
    v.ptr[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42f, 0, 0, 0]);
}

// Note: this is just another name for _mm_cvtss_si32
alias _mm_cvt_ss2si = _mm_cvtss_si32;


__m128 _mm_cvtpi16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16);  // Replicate sign bit
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpi16_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}

__m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted
{
    __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
    a.ptr[0] = fb.array[0];
    a.ptr[1] = fb.array[1];
    return a;
}
unittest
{
    __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
    float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
    assert(R.array == correct);
}


__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted
{
    long2 l;
    l.ptr[0] = a.array[0];
    l.ptr[1] = b.array[0];
    return _mm_cvtepi32_ps(cast(__m128i)l);
}

__m128 _mm_cvtpi8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());

    // Replicate sign bit
    b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpi8_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}

__m64 _mm_cvtps_pi16 (__m128 a) pure @safe
{
    // The C++ version of this intrinsic converts to 32-bit integers first, then uses packssdw,
    // which means the 16-bit results are saturated.
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, b);
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
    short4 R = cast(short4) _mm_cvtps_pi16(A);
    short[4] correct = [-1, 2, -32768, 32767];
    assert(R.array == correct);
}

__m64 _mm_cvtps_pi32 (__m128 a) pure @safe
{
    return to_m64(_mm_cvtps_epi32(a));
}
unittest
{
    __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f);
    int2 R = cast(int2) _mm_cvtps_pi32(A);
    int[2] correct = [-33000, 70000];
    assert(R.array == correct);
}

__m64 _mm_cvtps_pi8 (__m128 a) pure @safe
{
    // The C++ version of this intrinsic converts to 32-bit integers first, then uses packssdw + packsswb,
    // which means the 8-bit results are saturated.
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, _mm_setzero_si128());
    b = _mm_packs_epi16(b, _mm_setzero_si128());
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
    byte8 R = cast(byte8) _mm_cvtps_pi8(A);
    byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
    assert(R.array == correct);
}

__m128 _mm_cvtpu16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpu16_ps(A);
    float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f];
    assert(R.array == correct);
}

__m128 _mm_cvtpu8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpu8_ps(A);
    float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f];
    assert(R.array == correct);
}

__m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted
{
    v.ptr[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

// Note: on macOS, using "llvm.x86.sse.cvtsi642ss" was buggy
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted
{
    v.ptr[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

float _mm_cvtss_f32(__m128 a) pure @safe
{
    return a.array[0];
}

version(LDC)
{
    alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
}
else
{
    int _mm_cvtss_si32 (__m128 a) pure @safe
    {
        return convertFloatToInt32UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
}

version(LDC)
{
    version(X86_64)
        alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
    else
    {
        // Note: __builtin_ia32_cvtss2si64 crashes LDC in 32-bit
        long _mm_cvtss_si64 (__m128 a) pure @safe
        {
            return convertFloatToInt64UsingMXCSR(a.array[0]);
        }
    }
}
else
{
    long _mm_cvtss_si64 (__m128 a) pure @safe
    {
        return convertFloatToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.5f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}


version(LDC)
{
    alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;
}
else
{
    int _mm_cvtt_ss2si (__m128 a) pure @safe
    {
        return cast(int)(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

__m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
{
    return to_m64(_mm_cvttps_epi32(a));
}

alias _mm_cvttss_si32 = _mm_cvtt_ss2si; // it's actually the same op

// Note: __builtin_ia32_cvttss2si64 crashes LDC when generating 32-bit x86 code.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    return a / b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ps(a, a);
    float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}

__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
        return __builtin_ia32_divss(a, b);
    else
    {
        a[0] /= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ss(a, a);
    float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}

int _mm_extract_pi16 (__m64 a, int imm8)
{
    short4 sa = cast(short4)a;
    return cast(ushort)(sa.array[imm8]);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 6, 0, 4);
    assert(_mm_extract_pi16(A, 0) == 65535);
    assert(_mm_extract_pi16(A, 1) == 6);
    assert(_mm_extract_pi16(A, 2) == 0);
    assert(_mm_extract_pi16(A, 3) == 4);
}

/// Free aligned memory that was allocated with `_mm_malloc`.
void _mm_free(void * mem_addr) @trusted
{
    // support for free(NULL)
    if (mem_addr is null)
        return;

    // Technically we don't need to store size and alignment in the chunk, but we do in case we
    // have to implement _mm_realloc

    size_t pointerSize = (void*).sizeof;
    void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof);
    size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
    size_t alignment = *alignmentLocation;
    assert(alignment != 0);
    assert(isPointerAligned(mem_addr, alignment));
    free(*rawLocation);
}

uint _MM_GET_EXCEPTION_MASK() pure @safe
{
    return _mm_getcsr() & _MM_MASK_MASK;
}

uint _MM_GET_EXCEPTION_STATE() pure @safe
{
    return _mm_getcsr() & _MM_EXCEPT_MASK;
}

uint _MM_GET_FLUSH_ZERO_MODE() pure @safe
{
    return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

uint _MM_GET_ROUNDING_MODE() pure @safe
{
    return _mm_getcsr() & _MM_ROUND_MASK;
}

uint _mm_getcsr() pure @safe
{
    version(GNU)
    {
        static if (GDC_with_SSE)
        {
            return __builtin_ia32_stmxcsr();
        }
        else version(X86)
        {
            uint sseRounding = 0;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %0;\n"
                : "=m" (sseRounding)
                :
                : ;
            }
            return sseRounding;
        }
        else
            static assert(false);
    }
    else version (InlineX86Asm)
    {
        uint controlWord;
        asm nothrow @nogc pure @safe
        {
            stmxcsr controlWord;
        }
        return controlWord;
    }
    else
        static assert(0, "Not yet supported");
}
unittest
{
    uint csr = _mm_getcsr();
}

__m64 _mm_insert_pi16 (__m64 v, int i, int index) pure @trusted
{
    short4 r = cast(short4)v;
    r.ptr[index & 3] = cast(short)i;
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_set_pi16(3, 2, 1, 0);
    short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4);
    short[4] correct = [0, 42, 2, 3];
    assert(R.array == correct);
}

__m128 _mm_load_ps(const(float)*p) pure @trusted
{
    return *cast(__m128*)p;
}

__m128 _mm_load_ps1(const(float)*p) pure @trusted
{
    return __m128(*p);
}

__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    __m128 r;
    r.ptr[0] = *mem_addr;
    r.ptr[1] = 0;
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return r;
}

alias _mm_load1_ps = _mm_load_ps1;

__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
{
    long2 la = cast(long2)a;
    la.ptr[1] = (*mem_addr).array[0];
    return cast(__m128)la;
}

__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
{
    long2 la = cast(long2)a;
    la.ptr[0] = (*mem_addr).array[0];
    return cast(__m128)la;
}

__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 a = *aligned;
    __m128 r;
    r.ptr[0] = a.array[3];
    r.ptr[1] = a.array[2];
    r.ptr[2] = a.array[1];
    r.ptr[3] = a.array[0];
    return r;
}
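
// Minimal sketch of the reversed load; `_mm_loadr_ps` reads 16 aligned bytes,
// so the buffer is explicitly aligned as in the `_mm_stream_ps` test below.
unittest
{
    align(16) float[4] A = [1.0f, 2.0f, 3.0f, 4.0f];
    __m128 R = _mm_loadr_ps(A.ptr);
    float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f];
    assert(R.array == correct);
}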

__m128 _mm_loadu_ps(const(float)*p) pure @safe
{
    return loadUnaligned!(__m128)(p);
}

__m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted
{
    short r = *cast(short*)(mem_addr);
    short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
    result.ptr[0] = r;
    return cast(__m128i)result;
}
unittest
{
    short r = 13;
    short8 A = cast(short8) _mm_loadu_si16(&r);
    short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
    assert(A.array == correct);
}

__m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted
{
    long r = *cast(long*)(mem_addr);
    long2 result = [0, 0];
    result.ptr[0] = r;
    return cast(__m128i)result;
}
unittest
{
    long r = 446446446446;
    long2 A = cast(long2) _mm_loadu_si64(&r);
    long[2] correct = [446446446446, 0];
    assert(A.array == correct);
}

/// Allocate `size` bytes of memory, aligned to the alignment specified in `alignment`,
/// and return a pointer to the allocated memory. `_mm_free` should be used to free
/// memory that is allocated with `_mm_malloc`.
void* _mm_malloc(size_t size, size_t alignment) @trusted
{
    assert(alignment != 0);
    size_t request = requestedSize(size, alignment);
    void* raw = malloc(request);
    if (request > 0 && raw == null) // malloc(0) can validly return anything
        onOutOfMemoryError();
    return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
}

void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted
{
    // this works since mask is zero-extended
    return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr);
}

deprecated alias _m_maskmovq = _mm_maskmove_si64;

__m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b)));
}

static if (GDC_with_SSE)
{
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else version(LDC)
{
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else
{
    __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
    {
        __m128 r;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        r[1] = (a[1] > b[1]) ? a[1] : b[1];
        r[2] = (a[2] > b[2]) ? a[2] : b[2];
        r[3] = (a[3] > b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_max_ps(A, B);
    assert(M.array[0] == 4);
    assert(M.array[1] == 2);
    assert(M.array[2] == 4);          // in case of NaN, second operand prevails (as it seems)
    assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
}

__m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b)));
}
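
// Minimal sketch of the unsigned byte maximum; -3 and -5 act as 253 and 251
// since the comparison is unsigned.
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, -3, 4,  5, 6, 7, 8);
    __m64 B = _mm_setr_pi8(2, 1,  3, 4, -5, 6, 8, 7);
    byte8 R = cast(byte8) _mm_max_pu8(A, B);
    byte[8] correct = [2, 2, -3, 4, -5, 6, 8, 8];
    assert(R.array == correct);
}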

static if (GDC_with_SSE)
{
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else version(LDC)
{
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else
{
    __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
    {
        __m128 r = a;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_max_ss(A, B);
    assert(M.array[0] == 4);
    assert(M.array[1] == 2);
    assert(M.array[2] == 3);
    assert(M.array[3] == 4);
    M = _mm_max_ps(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_max_ps(C, A); // in case of NaN, second operand prevails
    assert(M.array[0] == 1);
}

__m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b)));
}

static if (GDC_with_SSE)
{
    alias _mm_min_ps = __builtin_ia32_minps;
}
else version(LDC)
{
    alias _mm_min_ps = __builtin_ia32_minps;
}
else
{
    __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
    {
        __m128 r;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        r[1] = (a[1] < b[1]) ? a[1] : b[1];
        r[2] = (a[2] < b[2]) ? a[2] : b[2];
        r[3] = (a[3] < b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_min_ps(A, B);
    assert(M.array[0] == 1);
    assert(M.array[1] == 1);
    assert(M.array[2] == 4);          // in case of NaN, second operand prevails (as it seems)
    assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
}

__m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b)));
}
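
// Minimal sketch of the unsigned byte minimum, mirroring the `_mm_max_pu8` check:
// the negative bytes are large unsigned values and therefore never the minimum.
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, -3, 4,  5, 6, 7, 8);
    __m64 B = _mm_setr_pi8(2, 1,  3, 4, -5, 6, 8, 7);
    byte8 R = cast(byte8) _mm_min_pu8(A, B);
    byte[8] correct = [1, 1, 3, 4, 5, 6, 7, 7];
    assert(R.array == correct);
}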

static if (GDC_with_SSE)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}
else version(LDC)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}
else
{
    __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
    {
        __m128 r = a;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_min_ss(A, B);
    assert(M.array[0] == 1);
    assert(M.array[1] == 2);
    assert(M.array[2] == 3);
    assert(M.array[3] == 4);
    M = _mm_min_ps(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_min_ps(C, A); // in case of NaN, second operand prevails
    assert(M.array[0] == 1);
}

__m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}

__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted
{
    // Returns [ b[2], b[3], a[2], a[3] ], matching MOVHLPS semantics.
    a.ptr[0] = b.array[2];
    a.ptr[1] = b.array[3];
    return a;
}

__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted
{
    a.ptr[2] = b.array[0];
    a.ptr[3] = b.array[1];
    return a;
}

int _mm_movemask_pi8 (__m64 a) pure @safe
{
    return _mm_movemask_epi8(to_m128i(a));
}
unittest
{
    assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0)));
}

static if (GDC_with_SSE)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}
else version(LDC)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}
else
{
    int _mm_movemask_ps (__m128 a) pure @safe
    {
        int4 ai = cast(int4)a;
        int r = 0;
        if (ai[0] < 0) r += 1;
        if (ai[1] < 0) r += 2;
        if (ai[2] < 0) r += 4;
        if (ai[3] < 0) r += 8;
        return r;
    }
}
unittest
{
    int4 A = [-1, 0, -43, 0];
    assert(5 == _mm_movemask_ps(cast(float4)A));
}

__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    return a * b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ps(a, a);
    float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(a.array == correct);
}

__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
        return __builtin_ia32_mulss(a, b);
    else
    {
        a[0] *= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ss(a, a);
    float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(a.array == correct);
}

__m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(0, -16, 2, 3);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pu16(A, B);
    short[4] correct = [0, 0x3FFC, 0, 0];
    assert(R.array == correct);
}

__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
}

deprecated alias
    _m_pavgb = _mm_avg_pu8,
    _m_pavgw = _mm_avg_pu16,
    _m_pextrw = _mm_extract_pi16,
    _m_pinsrw = _mm_insert_pi16,
    _m_pmaxsw = _mm_max_pi16,
    _m_pmaxub = _mm_max_pu8,
    _m_pminsw = _mm_min_pi16,
    _m_pminub = _mm_min_pu8,
    _m_pmovmskb = _mm_movemask_pi8,
    _m_pmulhuw = _mm_mulhi_pu16;

enum _MM_HINT_NTA = 0;
enum _MM_HINT_T0 = 1;
enum _MM_HINT_T1 = 2;
enum _MM_HINT_T2 = 3;

// Note: locality must be compile-time, unlike in the Intel Intrinsics API
void _mm_prefetch(int locality)(void* p) pure @safe
{
    llvm_prefetch(p, 0, locality, 1);
}
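
// Usage sketch for the compile-time locality parameter; prefetching has no
// observable effect to assert on, so this only checks that the call compiles.
unittest
{
    ubyte[64] cacheline;
    _mm_prefetch!_MM_HINT_T0(cacheline.ptr);
    _mm_prefetch!_MM_HINT_NTA(cacheline.ptr);
}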

deprecated alias
    _m_psadbw = _mm_sad_pu8,
    _m_pshufw = _mm_shuffle_pi16;

static if (GDC_with_SSE)
{
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
else version(LDC)
{
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
else
{
    __m128 _mm_rcp_ps (__m128 a) pure @safe
    {
        a[0] = 1.0f / a[0];
        a[1] = 1.0f / a[1];
        a[2] = 1.0f / a[2];
        a[3] = 1.0f / a[3];
        return a;
    }
}

static if (GDC_with_SSE)
{
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
else version(LDC)
{
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
else
{
    __m128 _mm_rcp_ss (__m128 a) pure @safe
    {
        a[0] = 1.0f / a[0];
        return a;
    }
}

static if (GDC_with_SSE)
{
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
else version(LDC)
{
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
else
{
    __m128 _mm_rsqrt_ps (__m128 a) pure @safe
    {
        a[0] = 1.0f / sqrt(a[0]);
        a[1] = 1.0f / sqrt(a[1]);
        a[2] = 1.0f / sqrt(a[2]);
        a[3] = 1.0f / sqrt(a[3]);
        return a;
    }
}

static if (GDC_with_SSE)
{
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
else version(LDC)
{
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
else
{
    __m128 _mm_rsqrt_ss (__m128 a) pure @safe
    {
        a[0] = 1.0f / sqrt(a[0]);
        return a;
    }
}

unittest
{
    double maxRelativeError = 0.000245; // -72 dB
    void testInvSqrt(float number) nothrow @nogc
    {
        __m128 A = _mm_set1_ps(number);

        // test _mm_rcp_ps
        __m128 B = _mm_rcp_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / A.array[i];
            double ratio = cast(double)(B.array[i]) / cast(double)(exact);
            assert(abs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rcp_ss
        {
            B = _mm_rcp_ss(A);
            double exact = 1.0f / A.array[0];
            double ratio = cast(double)(B.array[0]) / cast(double)(exact);
            assert(abs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ps
        B = _mm_rsqrt_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / sqrt(A.array[i]);
            double ratio = cast(double)(B.array[i]) / cast(double)(exact);
            assert(abs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ss
        {
            B = _mm_rsqrt_ss(A);
            double exact = 1.0f / sqrt(A.array[0]);
            double ratio = cast(double)(B.array[0]) / cast(double)(exact);
            assert(abs(ratio - 1) <= maxRelativeError);
        }
    }

    testInvSqrt(1.1f);
    testInvSqrt(2.45674864151f);
    testInvSqrt(27841456468.0f);
}

__m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b)));
}

void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
}

void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
}

void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
}
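
// Round-trip sketch for the MXCSR flush-to-zero helpers defined above; the
// previous mode is restored so other tests are unaffected.
unittest
{
    uint saved = _MM_GET_FLUSH_ZERO_MODE();
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    assert(_MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON);
    _MM_SET_FLUSH_ZERO_MODE(saved);
}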

__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    // Note: despite appearances, generates sensible code,
    //       inlines correctly and is constant folded
    float[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(float4)(result.ptr);
}
unittest
{
    __m128 A = _mm_set_ps(3, 2, 1, 546);
    float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f];
    assert(A.array == correct);
    assert(A.array[0] == 546.0f);
    assert(A.array[1] == 1.0f);
    assert(A.array[2] == 2.0f);
    assert(A.array[3] == 3.0f);
}

alias _mm_set_ps1 = _mm_set1_ps;

void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
}

__m128 _mm_set_ss (float a) pure @trusted
{
    __m128 r = _mm_setzero_ps();
    r.ptr[0] = a;
    return r;
}
unittest
{
    float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
    __m128 A = _mm_set_ss(42.0f);
    assert(A.array == correct);
}

__m128 _mm_set1_ps (float a) pure @trusted
{
    return __m128(a);
}
unittest
{
    float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f];
    __m128 A = _mm_set1_ps(42.0f);
    assert(A.array == correct);
}


void _mm_setcsr(uint controlWord) pure @safe
{
    version(GNU)
    {
        static if (GDC_with_SSE)
        {
            __builtin_ia32_ldmxcsr(controlWord);
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "ldmxcsr %0;\n"
                :
                : "m" (controlWord)
                : ;
            }
        }
    }
    else version (InlineX86Asm)
    {
        asm pure nothrow @nogc @safe
        {
            ldmxcsr controlWord;
        }
    }
    else
        static assert(0, "Not yet supported");
}
unittest
{
    _mm_setcsr(_mm_getcsr());
}

__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] result = [e3, e2, e1, e0];
    return loadUnaligned!(float4)(result.ptr);
}
unittest
{
    __m128 A = _mm_setr_ps(3, 2, 1, 546);
    float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f];
    assert(A.array == correct);
    assert(A.array[0] == 3.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 1.0f);
    assert(A.array[3] == 546.0f);
}

__m128 _mm_setzero_ps() pure @trusted
{
    // Compiles to xorps without problems
    float[4] result = [0.0f, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(result.ptr);
}

version(GNU)
{
    void _mm_sfence() pure @trusted
    {
        static if (GDC_with_SSE)
        {
            __builtin_ia32_sfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "sfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else version(LDC)
{
    alias _mm_sfence = __builtin_ia32_sfence;
}
else static if (DMD_with_asm)
{
    void _mm_sfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            sfence;
        }
    }
}
else
    static assert(false);
unittest
{
    _mm_sfence();
}

__m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @safe
{
    return cast(__m64) shufflevector!(short4, ( (imm8 >> 0) & 3 ),
                                              ( (imm8 >> 2) & 3 ),
                                              ( (imm8 >> 4) & 3 ),
                                              ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a);
}
unittest
{
    __m64 A = _mm_setr_pi16(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A);
    short[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

// Note: the immediate shuffle value is given at compile-time instead of runtime.
__m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
}
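
// Usage sketch for the compile-time shuffle immediate, built with `_MM_SHUFFLE`
// as in the `_mm_shuffle_pi16` test above: the two low lanes come from `a`,
// the two high lanes from `b`.
unittest
{
    __m128 A = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
    __m128 R = _mm_shuffle_ps!(_MM_SHUFFLE(3, 2, 1, 0))(A, B);
    float[4] correct = [0.0f, 1.0f, 6.0f, 7.0f];
    assert(R.array == correct);
}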

static if (GDC_with_SSE)
{
    alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
}
else version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
    else
    {
        __m128 _mm_sqrt_ps(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            vec.array[2] = llvm_sqrt(vec.array[2]);
            vec.array[3] = llvm_sqrt(vec.array[3]);
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ps(__m128 vec) pure @trusted
    {
        vec.ptr[0] = sqrt(vec.array[0]);
        vec.ptr[1] = sqrt(vec.array[1]);
        vec.ptr[2] = sqrt(vec.array[2]);
        vec.ptr[3] = sqrt(vec.array[3]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 2.0f);
    assert(A.array[3] == 2.0f);
}

static if (GDC_with_SSE)
{
    alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
}
else version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
    else
    {
        __m128 _mm_sqrt_ss(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            vec.array[2] = vec.array[2];
            vec.array[3] = vec.array[3];
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ss(__m128 vec) pure @trusted
    {
        vec.ptr[0] = sqrt(vec.array[0]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 4.0f);
    assert(A.array[2] == 4.0f);
    assert(A.array[3] == 4.0f);
}

void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = a;
}

alias _mm_store_ps1 = _mm_store1_ps;

void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    *mem_addr = a.array[0];
}
unittest
{
    float a;
    _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546));
    assert(a == 546);
}

void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[0];
    r.ptr[2] = a.array[0];
    r.ptr[3] = a.array[0];
    *aligned = r;
}

void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted
{
    long2 la = cast(long2)a;
    (*p).ptr[0] = la.array[1];
}
unittest
{
    __m64 R = _mm_setzero_si64();
    long2 A = [13, 25];
    _mm_storeh_pi(&R, cast(__m128)A);
    assert(R.array[0] == 25);
}

void _mm_storel_pi(__m64* p, __m128 a) pure @trusted
{
    long2 la = cast(long2)a;
    (*p).ptr[0] = la.array[0];
}
unittest
{
    __m64 R = _mm_setzero_si64();
    long2 A = [13, 25];
    _mm_storel_pi(&R, cast(__m128)A);
    assert(R.array[0] == 13);
}

void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 r;
    r.ptr[0] = a.array[3];
    r.ptr[1] = a.array[2];
    r.ptr[2] = a.array[1];
    r.ptr[3] = a.array[0];
    *aligned = r;
}

void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
{
    storeUnaligned!(float4)(a, mem_addr);
}

void _mm_stream_pi (__m64* mem_addr, __m64 a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a; // it's a regular move instead
}

// BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal
// needs some IR outside this function that would say:
//
//  !0 = !{ i32 1 }
//
// It's a LLVM IR metadata description.
// Regardless, non-temporal moves are really dangerous for performance...
void _mm_stream_ps (float* mem_addr, __m128 a)
{
    __m128* dest = cast(__m128*)mem_addr;
    *dest = a; // it's a regular move instead
}
unittest
{
    align(16) float[4] A;
    _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
    assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
}

__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    return a - b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ps(a, a);
    float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == correct);
}

__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
        return __builtin_ia32_subss(a, b);
    else
    {
        a[0] -= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ss(a, a);
    float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}


void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    __m128 tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm_unpacklo_ps(row0, row1);
    tmp2 = _mm_unpacklo_ps(row2, row3);
    tmp1 = _mm_unpackhi_ps(row0, row1);
    tmp3 = _mm_unpackhi_ps(row2, row3);
    row0 = _mm_movelh_ps(tmp0, tmp2);
    row1 = _mm_movehl_ps(tmp2, tmp0);
    row2 = _mm_movelh_ps(tmp1, tmp3);
    row3 = _mm_movehl_ps(tmp3, tmp1);
}
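
// End-to-end sketch of the 4x4 transpose: rows become columns. Values are
// chosen so each element identifies its original (row, column) position.
unittest
{
    __m128 l0 = _mm_setr_ps( 0.0f,  1.0f,  2.0f,  3.0f);
    __m128 l1 = _mm_setr_ps( 4.0f,  5.0f,  6.0f,  7.0f);
    __m128 l2 = _mm_setr_ps( 8.0f,  9.0f, 10.0f, 11.0f);
    __m128 l3 = _mm_setr_ps(12.0f, 13.0f, 14.0f, 15.0f);
    _MM_TRANSPOSE4_PS(l0, l1, l2, l3);
    float[4] c0 = [0.0f, 4.0f,  8.0f, 12.0f];
    float[4] c1 = [1.0f, 5.0f,  9.0f, 13.0f];
    float[4] c2 = [2.0f, 6.0f, 10.0f, 14.0f];
    float[4] c3 = [3.0f, 7.0f, 11.0f, 15.0f];
    assert(l0.array == c0);
    assert(l1.array == c1);
    assert(l2.array == c2);
    assert(l3.array == c3);
}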

// Note: the only difference between the following intrinsics and the comiss
// versions above is the signalling behaviour on quiet NaNs. Aliasing them is
// technically incorrect, but the case where you would want to distinguish
// qNaN from sNaN and then treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_ss = _mm_comieq_ss;
alias _mm_ucomige_ss = _mm_comige_ss;
alias _mm_ucomigt_ss = _mm_comigt_ss;
alias _mm_ucomile_ss = _mm_comile_ss;
alias _mm_ucomilt_ss = _mm_comilt_ss;
alias _mm_ucomineq_ss = _mm_comineq_ss;


__m128 _mm_undefined_ps() pure @safe
{
    __m128 undef = void;
    return undef;
}

__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted
{
    __m128 r;
    r.ptr[0] = a.array[2];
    r.ptr[1] = b.array[2];
    r.ptr[2] = a.array[3];
    r.ptr[3] = b.array[3];
    return r;
}

__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted
{
    __m128 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = b.array[0];
    r.ptr[2] = a.array[1];
    r.ptr[3] = b.array[1];
    return r;
}

__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
}


private
{
    /// Returns: `true` if the pointer is suitably aligned.
    bool isPointerAligned(void* p, size_t alignment) pure
    {
        assert(alignment != 0);
        return ( cast(size_t)p & (alignment - 1) ) == 0;
    }

    /// Returns: next pointer aligned with alignment bytes.
    void* nextAlignedPointer(void* start, size_t alignment) pure
    {
        return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
    }

    // Returns number of bytes to actually allocate when asking
    // for a particular alignment
    @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        return askedSize + alignment - 1 + pointerSize * 3;
    }

    // Store the pointer given by malloc, the size and the alignment
    @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        char* start = cast(char*)raw + pointerSize * 3;
        void* aligned = nextAlignedPointer(start, alignment);
        void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
        *rawLocation = raw;
        size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
        *sizeLocation = size;
        size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
        *alignmentLocation = alignment;
        assert( isPointerAligned(aligned, alignment) );
        return aligned;
    }

    // Returns: x, multiple of powerOfTwo, so that x >= n.
    @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
    {
        // check power-of-two
        assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));

        size_t mask = ~(powerOfTwo - 1);
        return (n + powerOfTwo - 1) & mask;
    }
}

unittest
{
    assert(nextMultipleOf(0, 4) == 0);
    assert(nextMultipleOf(1, 4) == 4);
    assert(nextMultipleOf(2, 4) == 4);
    assert(nextMultipleOf(3, 4) == 4);
    assert(nextMultipleOf(4, 4) == 4);
    assert(nextMultipleOf(5, 4) == 8);

    {
        void* p = _mm_malloc(23, 16);
        assert(p !is null);
        assert(((cast(size_t)p) & 0xf) == 0);
        _mm_free(p);
    }

    void* nullAlloc = _mm_malloc(0, 32);
    assert(nullAlloc != null);
    _mm_free(nullAlloc);
}