/**
* Copyright: Copyright Auburn Sounds 2016-2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.xmmintrin;

public import inteli.types;

import inteli.internals;

import inteli.mmx;
import inteli.emmintrin;

import core.stdc.stdlib: malloc, free;
import core.exception: onOutOfMemoryError;

version(D_InlineAsm_X86)
    version = InlineX86Asm;
else version(D_InlineAsm_X86_64)
    version = InlineX86Asm;


// SSE1

nothrow @nogc:


enum int _MM_EXCEPT_INVALID    = 0x0001;
enum int _MM_EXCEPT_DENORM     = 0x0002;
enum int _MM_EXCEPT_DIV_ZERO   = 0x0004;
enum int _MM_EXCEPT_OVERFLOW   = 0x0008;
enum int _MM_EXCEPT_UNDERFLOW  = 0x0010;
enum int _MM_EXCEPT_INEXACT    = 0x0020;
enum int _MM_EXCEPT_MASK       = 0x003f;

enum int _MM_MASK_INVALID      = 0x0080;
enum int _MM_MASK_DENORM       = 0x0100;
enum int _MM_MASK_DIV_ZERO     = 0x0200;
enum int _MM_MASK_OVERFLOW     = 0x0400;
enum int _MM_MASK_UNDERFLOW    = 0x0800;
enum int _MM_MASK_INEXACT      = 0x1000;
enum int _MM_MASK_MASK         = 0x1f80;

enum int _MM_ROUND_NEAREST     = 0x0000;
enum int _MM_ROUND_DOWN        = 0x2000;
enum int _MM_ROUND_UP          = 0x4000;
enum int _MM_ROUND_TOWARD_ZERO = 0x6000;
enum int _MM_ROUND_MASK        = 0x6000;

enum int _MM_FLUSH_ZERO_MASK   = 0x8000;
enum int _MM_FLUSH_ZERO_ON     = 0x8000;
enum int _MM_FLUSH_ZERO_OFF    = 0x0000;

__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    return a + b;
}

unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ps(a, a);
    assert(a.array[0] == 2);
    assert(a.array[1] == 4);
    assert(a.array[2] == 6);
    assert(a.array[3] == 8);
}

__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    a[0] += b[0];
    return a;
}
unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ss(a, a);
    assert(a.array == [2.0f, 2, 3, 4]);
}

__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
}
unittest
{
    // Note: tested in emmintrin.d
}

__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
}
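// Example: PAVGW averages with rounding up, i.e. computes (a + b + 1) >> 1
// on each unsigned 16-bit lane.
unittest
{
    __m64 A = _mm_setr_pi16(9, 2,  0, 7);
    __m64 B = _mm_setr_pi16(4, 1, 24, 8);
    short4 R = cast(short4) _mm_avg_pu16(A, B);
    short[4] correct = [7, 2, 12, 8];
    assert(R.array == correct);
}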
/// Average packed unsigned 8-bit integers in `a` and `b`.
__m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
}

__m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
}

__m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
}

__m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.oge)(a, b);
}
unittest
{
    __m128i R = cast(__m128i) _mm_cmpge_ps(_mm_setr_ps(0, 1, -1, float.nan),
                                           _mm_setr_ps(0, 0,  0, 0));
    int[4] correct = [-1, -1, 0, 0];
    assert(R.array == correct);
}

__m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.oge)(a, b);
}

__m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
}

__m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
}

__m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ole)(a, b);
}

__m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ole)(a, b);
}

__m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.olt)(a, b);
}

__m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.olt)(a, b);
}

__m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.une)(a, b);
}

__m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.une)(a, b);
}

__m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ult)(a, b);
}

__m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ult)(a, b);
}

__m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ule)(a, b);
}

__m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ule)(a, b);
}

__m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ugt)(a, b);
}

__m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
}

__m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.uge)(a, b);
}

__m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.uge)(a, b);
}

__m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ord)(a, b);
}

__m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ord)(a, b);
}

__m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.uno)(a, b);
}

__m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.uno)(a, b);
}
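// Example: ordered comparisons are false when an operand is NaN, while
// `unord` is true exactly in that case; true lanes are all-ones.
unittest
{
    __m128 A = _mm_setr_ps(-1, 0, 1, float.nan);
    __m128 Z = _mm_setzero_ps();
    __m128i L = cast(__m128i) _mm_cmplt_ps(A, Z);
    int[4] correctL = [-1, 0, 0, 0];
    assert(L.array == correctL);
    __m128i U = cast(__m128i) _mm_cmpunord_ps(A, Z);
    int[4] correctU = [0, 0, 0, -1];
    assert(U.array == correctU);
}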
// Note: we've reverted clang and GCC behaviour with regard to EFLAGS.
// Some such comparisons yield true for NaNs, others don't.

int _mm_comieq_ss (__m128 a, __m128 b) pure @safe // comiss + sete
{
    return comss!(FPComparison.ueq)(a, b); // yields true for NaN!
}

int _mm_comige_ss (__m128 a, __m128 b) pure @safe // comiss + setae
{
    return comss!(FPComparison.oge)(a, b);
}

int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
{
    return comss!(FPComparison.ogt)(a, b);
}

int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
{
    return comss!(FPComparison.ule)(a, b); // yields true for NaN!
}

int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
{
    return comss!(FPComparison.ult)(a, b); // yields true for NaN!
}

int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
{
    return comss!(FPComparison.one)(a, b);
}

alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;

__m64 _mm_cvt_ps2pi (__m128 a) pure @safe
{
    return to_m64(_mm_cvtps_epi32(a));
}

__m128 _mm_cvt_si2ss(__m128 v, int x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42f, 0, 0, 0]);
}

// Note: is just another name for _mm_cvtss_si32
alias _mm_cvt_ss2si = _mm_cvtss_si32;


__m128 _mm_cvtpi16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16);  // Replicate sign bit
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpi16_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}

__m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
{
    __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
    a[0] = fb[0];
    a[1] = fb[1];
    return a;
}
unittest
{
    __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
    float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
    assert(R.array == correct);
}


__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @safe
{
    long2 l;
    l[0] = a[0];
    l[1] = b[0];
    return _mm_cvtepi32_ps(cast(__m128i)l);
}
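// Example: the two 32-bit lanes of `a` become the low two floats,
// those of `b` the high two.
unittest
{
    __m64 A = _mm_setr_pi32(1, 2);
    __m64 B = _mm_setr_pi32(3, 4);
    __m128 R = _mm_cvtpi32x2_ps(A, B);
    float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}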
__m128 _mm_cvtpi8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());

    // Replicate sign bit
    b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpi8_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}

__m64 _mm_cvtps_pi16 (__m128 a) pure @safe
{
    // The C++ version of this intrinsic converts to 32-bit integers first,
    // then uses packssdw, so the 16-bit results are saturated.
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, b);
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
    short4 R = cast(short4) _mm_cvtps_pi16(A);
    short[4] correct = [-1, 2, -32768, 32767];
    assert(R.array == correct);
}

__m64 _mm_cvtps_pi32 (__m128 a) pure @safe
{
    return to_m64(_mm_cvtps_epi32(a));
}
unittest
{
    __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f);
    int2 R = cast(int2) _mm_cvtps_pi32(A);
    int[2] correct = [-33000, 70000];
    assert(R.array == correct);
}

__m64 _mm_cvtps_pi8 (__m128 a) pure @safe
{
    // The C++ version of this intrinsic converts to 32-bit integers first,
    // then uses packssdw + packsswb, so the 8-bit results are saturated.
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, _mm_setzero_si128());
    b = _mm_packs_epi16(b, _mm_setzero_si128());
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
    byte8 R = cast(byte8) _mm_cvtps_pi8(A);
    byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
    assert(R.array == correct);
}

__m128 _mm_cvtpu16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpu16_ps(A);
    float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f];
    assert(R.array == correct);
}

__m128 _mm_cvtpu8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpu8_ps(A);
    float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f];
    assert(R.array == correct);
}

__m128 _mm_cvtsi32_ss(__m128 v, int x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

// Note: on macOS, using "llvm.x86.sse.cvtsi642ss" was buggy
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @safe
{
    v[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

float _mm_cvtss_f32(__m128 a) pure @safe
{
    return a[0];
}
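// Example: extracts the lowest lane as a scalar.
unittest
{
    assert(_mm_cvtss_f32(_mm_setr_ps(1.5f, 2.0f, 3.0f, 4.0f)) == 1.5f);
}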
version(LDC)
{
    alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
}
else
{
    int _mm_cvtss_si32 (__m128 a) pure @safe
    {
        return convertFloatToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
}

version(LDC)
{
    version(X86_64)
        alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
    else
    {
        // Note: __builtin_ia32_cvtss2si64 crashes LDC in 32-bit
        long _mm_cvtss_si64 (__m128 a) pure @safe
        {
            return convertFloatToInt64UsingMXCSR(a[0]);
        }
    }
}
else
{
    long _mm_cvtss_si64 (__m128 a) pure @safe
    {
        return convertFloatToInt64UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.5f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}


version(LDC)
{
    alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;
}
else
{
    int _mm_cvtt_ss2si (__m128 a) pure @safe
    {
        return cast(int)(a[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

__m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
{
    return to_m64(_mm_cvttps_epi32(a));
}

alias _mm_cvttss_si32 = _mm_cvtt_ss2si; // it's actually the same op

// Note: __builtin_ia32_cvttss2si64 crashes LDC when generating 32-bit x86 code.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    return a / b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ps(a, a);
    float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}

__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    a[0] /= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ss(a, a);
    float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}

int _mm_extract_pi16 (__m64 a, int imm8)
{
    short4 sa = cast(short4)a;
    return cast(ushort)(sa[imm8]);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 6, 0, 4);
    assert(_mm_extract_pi16(A, 0) == 65535);
    assert(_mm_extract_pi16(A, 1) == 6);
    assert(_mm_extract_pi16(A, 2) == 0);
    assert(_mm_extract_pi16(A, 3) == 4);
}

/// Free aligned memory that was allocated with `_mm_malloc`.
void _mm_free(void * mem_addr) @trusted
{
    // support for free(NULL)
    if (mem_addr is null)
        return;

    // Technically we don't need to store size and alignment in the chunk, but we do in case we
    // have to implement _mm_realloc

    size_t pointerSize = (void*).sizeof;
    void** rawLocation = cast(void**)(cast(char*)mem_addr - pointerSize);
    size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
    size_t alignment = *alignmentLocation;
    assert(alignment != 0);
    assert(isPointerAligned(mem_addr, alignment));
    free(*rawLocation);
}

uint _MM_GET_EXCEPTION_MASK() pure @safe
{
    return _mm_getcsr() & _MM_MASK_MASK;
}

uint _MM_GET_EXCEPTION_STATE() pure @safe
{
    return _mm_getcsr() & _MM_EXCEPT_MASK;
}

uint _MM_GET_FLUSH_ZERO_MODE() pure @safe
{
    return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

uint _MM_GET_ROUNDING_MODE() pure @safe
{
    return _mm_getcsr() & _MM_ROUND_MASK;
}

uint _mm_getcsr() pure @safe
{
    version (InlineX86Asm)
    {
        uint controlWord;
        asm nothrow @nogc pure @safe
        {
            stmxcsr controlWord;
        }
        return controlWord;
    }
    else
        static assert(0, "Not yet supported");
}

__m64 _mm_insert_pi16 (__m64 v, int i, int index)
{
    short4 r = cast(short4)v;
    r[index & 3] = cast(short)i;
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_set_pi16(3, 2, 1, 0);
    short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4);
    short[4] correct = [0, 42, 2, 3];
    assert(R.array == correct);
}

__m128 _mm_load_ps(const(float)*p) pure @trusted
{
    return *cast(__m128*)p;
}

__m128 _mm_load_ps1(const(float)*p) pure @trusted
{
    return __m128(*p);
}
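// Example: `_mm_load_ps` needs a 16-byte aligned address, while
// `_mm_load_ps1` broadcasts a single float to all four lanes.
unittest
{
    align(16) float[4] A = [1.0f, 2.0f, 3.0f, 4.0f];
    __m128 B = _mm_load_ps(A.ptr);
    float[4] correctB = [1.0f, 2.0f, 3.0f, 4.0f];
    assert(B.array == correctB);
    __m128 C = _mm_load_ps1(A.ptr);
    float[4] correctC = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(C.array == correctC);
}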
__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    float[4] f = [ *mem_addr, 0.0f, 0.0f, 0.0f ];
    return loadUnaligned!(float4)(f.ptr);
}

alias _mm_load1_ps = _mm_load_ps1;

__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 la = cast(long2)a;
    la[1] = (*mem_addr)[0];
    return cast(__m128)la;
}

__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @safe
{
    long2 la = cast(long2)a;
    la[0] = (*mem_addr)[0];
    return cast(__m128)la;
}

__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 a = *aligned;
    return shufflevector!(__m128, 3, 2, 1, 0)(a, a);
}

__m128 _mm_loadu_ps(const(float)*p) pure @safe
{
    return loadUnaligned!(__m128)(p);
}

__m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted
{
    short r = *cast(short*)(mem_addr);
    short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
    result[0] = r;
    return cast(__m128i)result;
}
unittest
{
    short r = 13;
    short8 A = cast(short8) _mm_loadu_si16(&r);
    short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
    assert(A.array == correct);
}

__m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted
{
    long r = *cast(long*)(mem_addr);
    long2 result = [0, 0];
    result[0] = r;
    return cast(__m128i)result;
}
unittest
{
    long r = 446446446446;
    long2 A = cast(long2) _mm_loadu_si64(&r);
    long[2] correct = [446446446446, 0];
    assert(A.array == correct);
}

/// Allocate `size` bytes of memory, aligned to the alignment specified in `alignment`,
/// and return a pointer to the allocated memory. `_mm_free` should be used to free
/// memory that is allocated with `_mm_malloc`.
void* _mm_malloc(size_t size, size_t alignment) @trusted
{
    assert(alignment != 0);
    size_t request = requestedSize(size, alignment);
    void* raw = malloc(request);
    if (request > 0 && raw == null) // malloc(0) can validly return anything
        onOutOfMemoryError();
    return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
}

void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted
{
    // this works since mask is zero-extended
    return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr);
}

deprecated alias _m_maskmovq = _mm_maskmove_si64;

__m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b)));
}
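// Example: signed 16-bit maximum, lane by lane.
unittest
{
    short4 R = cast(short4) _mm_max_pi16(_mm_setr_pi16(-1,  5, -100, 7),
                                         _mm_setr_pi16( 2, -3,    0, 6));
    short[4] correct = [2, 5, 0, 7];
    assert(R.array == correct);
}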
version(LDC)
{
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else
{
    __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
    {
        __m128 r;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        r[1] = (a[1] > b[1]) ? a[1] : b[1];
        r[2] = (a[2] > b[2]) ? a[2] : b[2];
        r[3] = (a[3] > b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_max_ps(A, B);
    assert(M[0] == 4);
    assert(M[1] == 2);
    assert(M[2] == 4);    // in case of NaN, second operand prevails (as it seems)
    assert(M[3] != M[3]); // in case of NaN, second operand prevails (as it seems)
}

__m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b)));
}

version(LDC)
{
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else
{
    __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
    {
        __m128 r = a;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_max_ss(A, B);
    assert(M[0] == 4);
    assert(M[1] == 2);
    assert(M[2] == 3);
    assert(M[3] == 4);
    M = _mm_max_ps(A, C); // in case of NaN, second operand prevails
    assert(M[0] != M[0]);
    M = _mm_max_ps(C, A); // in case of NaN, second operand prevails
    assert(M[0] == 1);
}

__m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b)));
}

version(LDC)
{
    alias _mm_min_ps = __builtin_ia32_minps;
}
else
{
    __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
    {
        __m128 r;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        r[1] = (a[1] < b[1]) ? a[1] : b[1];
        r[2] = (a[2] < b[2]) ? a[2] : b[2];
        r[3] = (a[3] < b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_min_ps(A, B);
    assert(M[0] == 1);
    assert(M[1] == 1);
    assert(M[2] == 4);    // in case of NaN, second operand prevails (as it seems)
    assert(M[3] != M[3]); // in case of NaN, second operand prevails (as it seems)
}

__m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b)));
}
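// Example: unsigned 8-bit minimum; note that -1 compares as 255.
unittest
{
    byte8 R = cast(byte8) _mm_min_pu8(_mm_setr_pi8(-1, 2, 0, 45, 1, 2, 3, 4),
                                      _mm_setr_pi8( 1, 2, 9,  8, 1, 2, 3, 4));
    byte[8] correct = [1, 2, 0, 8, 1, 2, 3, 4];
    assert(R.array == correct);
}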
version(LDC)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}
else
{
    __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
    {
        __m128 r = a;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_min_ss(A, B);
    assert(M[0] == 1);
    assert(M[1] == 2);
    assert(M[2] == 3);
    assert(M[3] == 4);
    M = _mm_min_ps(A, C); // in case of NaN, second operand prevails
    assert(M[0] != M[0]);
    M = _mm_min_ps(C, A); // in case of NaN, second operand prevails
    assert(M[0] == 1);
}

__m128 _mm_move_ss (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, 4, 1, 2, 3)(a, b);
}

// Move the upper 2 elements of `b` to the lower 2 elements of the result;
// the upper 2 elements come from `a`.
__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 6, 7, 2, 3)(a, b);
}

__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 0, 1, 4, 5)(a, b);
}

int _mm_movemask_pi8 (__m64 a) pure @safe
{
    return _mm_movemask_epi8(to_m128i(a));
}
unittest
{
    assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0)));
}

version(LDC)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}
else
{
    int _mm_movemask_ps (__m128 a) pure @safe
    {
        int4 ai = cast(int4)a;
        int r = 0;
        if (ai[0] < 0) r += 1;
        if (ai[1] < 0) r += 2;
        if (ai[2] < 0) r += 4;
        if (ai[3] < 0) r += 8;
        return r;
    }
}
unittest
{
    int4 A = [-1, 0, -43, 0];
    assert(5 == _mm_movemask_ps(cast(float4)A));
}

__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    return a * b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ps(a, a);
    float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(a.array == correct);
}

__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    a[0] *= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ss(a, a);
    float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(a.array == correct);
}

__m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(0, -16, 2, 3);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pu16(A, B);
    short[4] correct = [0, 0x3FFC, 0, 0];
    assert(R.array == correct);
}

__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
}

deprecated alias
    _m_pavgb = _mm_avg_pu8,
    _m_pavgw = _mm_avg_pu16,
    _m_pextrw = _mm_extract_pi16,
    _m_pinsrw = _mm_insert_pi16,
    _m_pmaxsw = _mm_max_pi16,
    _m_pmaxub = _mm_max_pu8,
    _m_pminsw = _mm_min_pi16,
    _m_pminub = _mm_min_pu8,
    _m_pmovmskb = _mm_movemask_pi8,
    _m_pmulhuw = _mm_mulhi_pu16;

enum _MM_HINT_NTA = 0;
enum _MM_HINT_T0 = 1;
enum _MM_HINT_T1 = 2;
enum _MM_HINT_T2 = 3;

// Note: locality must be compile-time, unlike the Intel Intrinsics API
void _mm_prefetch(int locality)(void* p) pure @safe
{
    llvm_prefetch(p, 0, locality, 1);
}
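// Usage sketch: the hint is a template argument, since it selects the
// prefetch instruction at compile-time.
unittest
{
    ubyte[64] cacheline;
    _mm_prefetch!_MM_HINT_T0(cacheline.ptr);
    _mm_prefetch!_MM_HINT_NTA(cacheline.ptr);
}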
deprecated alias
    _m_psadbw = _mm_sad_pu8,
    _m_pshufw = _mm_shuffle_pi16;

version(LDC)
{
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
else
{
    __m128 _mm_rcp_ps (__m128 a) pure @safe
    {
        a[0] = 1.0f / a[0];
        a[1] = 1.0f / a[1];
        a[2] = 1.0f / a[2];
        a[3] = 1.0f / a[3];
        return a;
    }
}

version(LDC)
{
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
else
{
    __m128 _mm_rcp_ss (__m128 a) pure @safe
    {
        a[0] = 1.0f / a[0];
        return a;
    }
}

version(LDC)
{
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
else
{
    __m128 _mm_rsqrt_ps (__m128 a) pure @safe
    {
        a[0] = 1.0f / sqrt(a[0]);
        a[1] = 1.0f / sqrt(a[1]);
        a[2] = 1.0f / sqrt(a[2]);
        a[3] = 1.0f / sqrt(a[3]);
        return a;
    }
}

version(LDC)
{
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
else
{
    __m128 _mm_rsqrt_ss (__m128 a) pure @safe
    {
        a[0] = 1.0f / sqrt(a[0]);
        return a;
    }
}

unittest
{
    double maxRelativeError = 0.000245; // -72 dB
    void testInvSqrt(float number)
    {
        __m128 A = _mm_set1_ps(number);

        // test _mm_rcp_ps
        __m128 B = _mm_rcp_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / A[i];
            double ratio = cast(double)(B[i]) / cast(double)(exact);
            assert(fabs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rcp_ss
        {
            B = _mm_rcp_ss(A);
            double exact = 1.0f / A[0];
            double ratio = cast(double)(B[0]) / cast(double)(exact);
            assert(fabs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ps
        B = _mm_rsqrt_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / sqrt(A[i]);
            double ratio = cast(double)(B[i]) / cast(double)(exact);
            assert(fabs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ss
        {
            B = _mm_rsqrt_ss(A);
            double exact = 1.0f / sqrt(A[0]);
            double ratio = cast(double)(B[0]) / cast(double)(exact);
            assert(fabs(ratio - 1) <= maxRelativeError);
        }
    }

    testInvSqrt(1.1f);
    testInvSqrt(2.45674864151f);
    testInvSqrt(27841456468.0f);
}

__m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b)));
}

void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
}

void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
}

void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
}

__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    // Note: despite appearances, generates sensible code,
    // inlines correctly and is constant folded
    float[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(float4)(result.ptr);
}

alias _mm_set_ps1 = _mm_set1_ps;

void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) pure @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
}

__m128 _mm_set_ss (float a) pure @trusted
{
    __m128 r = _mm_setzero_ps();
    r[0] = a;
    return r;
}

__m128 _mm_set1_ps (float a) pure @trusted
{
    return __m128(a);
}

void _mm_setcsr(uint controlWord) pure @safe
{
    version (InlineX86Asm)
    {
        asm pure nothrow @nogc @safe
        {
            ldmxcsr controlWord;
        }
    }
    else
        static assert(0, "Not yet supported");
}
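// Example: the MXCSR helpers compose `_mm_getcsr`/`_mm_setcsr`; here we
// save, change and restore the flush-to-zero mode.
unittest
{
    uint saved = _MM_GET_FLUSH_ZERO_MODE();
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    assert(_MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON);
    _MM_SET_FLUSH_ZERO_MODE(saved);
}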
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] result = [e3, e2, e1, e0];
    return loadUnaligned!(float4)(result.ptr);
}

__m128 _mm_setzero_ps() pure @trusted
{
    // Compiles to xorps without problems
    float[4] result = [0.0f, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(result.ptr);
}

version(LDC)
{
    alias _mm_sfence = __builtin_ia32_sfence;
}
else
{
    void _mm_sfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            sfence;
        }
    }
}
unittest
{
    _mm_sfence();
}

__m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @safe
{
    return cast(__m64) shufflevector!(short4, ( (imm8 >> 0) & 3 ),
                                              ( (imm8 >> 2) & 3 ),
                                              ( (imm8 >> 4) & 3 ),
                                              ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a);
}
unittest
{
    __m64 A = _mm_setr_pi16(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A);
    short[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

// Note: the immediate shuffle value is given at compile-time instead of runtime.
__m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
}
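// Example: the two low lanes come from `a`, the two high lanes from `b`;
// with imm = _MM_SHUFFLE(z, y, x, w) the result is [a[w], a[x], b[y], b[z]].
unittest
{
    __m128 A = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
    __m128 R = _mm_shuffle_ps!(_MM_SHUFFLE(3, 2, 1, 0))(A, B);
    float[4] correct = [0.0f, 1.0f, 6.0f, 7.0f];
    assert(R.array == correct);
}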
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
    else
    {
        __m128 _mm_sqrt_ps(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            vec.array[2] = llvm_sqrt(vec.array[2]);
            vec.array[3] = llvm_sqrt(vec.array[3]);
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ps(__m128 vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = sqrt(vec.array[1]);
        vec.array[2] = sqrt(vec.array[2]);
        vec.array[3] = sqrt(vec.array[3]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 2.0f);
    assert(A.array[3] == 2.0f);
}

version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
    else
    {
        __m128 _mm_sqrt_ss(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ss(__m128 vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 4.0f);
    assert(A.array[2] == 4.0f);
    assert(A.array[3] == 4.0f);
}

void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = a;
}

alias _mm_store_ps1 = _mm_store1_ps;

void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    *mem_addr = a[0];
}
unittest
{
    float a;
    _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546));
    assert(a == 546);
}

void _mm_store1_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = shufflevector!(__m128, 0, 0, 0, 0)(a, a);
}

void _mm_storeh_pi(__m64* p, __m128 a) pure @safe
{
    long2 la = cast(long2)a;
    (*p)[0] = la[1];
}
unittest
{
    __m64 R = _mm_setzero_si64();
    long2 A = [13, 25];
    _mm_storeh_pi(&R, cast(__m128)A);
    assert(R[0] == 25);
}

void _mm_storel_pi(__m64* p, __m128 a) pure @safe
{
    long2 la = cast(long2)a;
    (*p)[0] = la[0];
}
unittest
{
    __m64 R = _mm_setzero_si64();
    long2 A = [13, 25];
    _mm_storel_pi(&R, cast(__m128)A);
    assert(R[0] == 13);
}

void _mm_storer_ps(float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = shufflevector!(__m128, 3, 2, 1, 0)(a, a);
}

void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
{
    storeUnaligned!(float4)(a, mem_addr);
}

void _mm_stream_pi (__m64* mem_addr, __m64 a)
{
    // BUG: see `_mm_stream_ps` for an explanation of why we don't implement non-temporal moves
    *mem_addr = a; // it's a regular move instead
}

// BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal
// needs some IR outside this function that would say:
//
//  !0 = !{ i32 1 }
//
// It's a LLVM IR metadata description.
// Regardless, non-temporal moves are really dangerous for performance...
void _mm_stream_ps (float* mem_addr, __m128 a)
{
    __m128* dest = cast(__m128*)mem_addr;
    *dest = a; // it's a regular move instead
}
unittest
{
    align(16) float[4] A;
    _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
    assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
}

__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    return a - b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ps(a, a);
    float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == correct);
}

__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    a[0] -= b[0];
    return a;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ss(a, a);
    float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}


void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    __m128 tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm_unpacklo_ps(row0, row1);
    tmp2 = _mm_unpacklo_ps(row2, row3);
    tmp1 = _mm_unpackhi_ps(row0, row1);
    tmp3 = _mm_unpackhi_ps(row2, row3);
    row0 = _mm_movelh_ps(tmp0, tmp2);
    row1 = _mm_movehl_ps(tmp2, tmp0);
    row2 = _mm_movelh_ps(tmp1, tmp3);
    row3 = _mm_movehl_ps(tmp3, tmp1);
}
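// Example: 4x4 in-place transpose built from unpack/movelh/movehl.
unittest
{
    __m128 l0 = _mm_setr_ps( 0.0f,  1.0f,  2.0f,  3.0f);
    __m128 l1 = _mm_setr_ps( 4.0f,  5.0f,  6.0f,  7.0f);
    __m128 l2 = _mm_setr_ps( 8.0f,  9.0f, 10.0f, 11.0f);
    __m128 l3 = _mm_setr_ps(12.0f, 13.0f, 14.0f, 15.0f);
    _MM_TRANSPOSE4_PS(l0, l1, l2, l3);
    float[4] correct0 = [0.0f, 4.0f,  8.0f, 12.0f];
    float[4] correct1 = [1.0f, 5.0f,  9.0f, 13.0f];
    float[4] correct2 = [2.0f, 6.0f, 10.0f, 14.0f];
    float[4] correct3 = [3.0f, 7.0f, 11.0f, 15.0f];
    assert(l0.array == correct0);
    assert(l1.array == correct1);
    assert(l2.array == correct2);
    assert(l3.array == correct3);
}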
// Note: the only difference between the ucomi and comi intrinsics is the
// signalling behaviour with quiet NaNs. This is incorrect, but the case
// where you would want to differentiate between qNaN and sNaN and then
// treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_ss = _mm_comieq_ss;
alias _mm_ucomige_ss = _mm_comige_ss;
alias _mm_ucomigt_ss = _mm_comigt_ss;
alias _mm_ucomile_ss = _mm_comile_ss;
alias _mm_ucomilt_ss = _mm_comilt_ss;
alias _mm_ucomineq_ss = _mm_comineq_ss;


__m128 _mm_undefined_ps() pure @safe
{
    __m128 undef = void;
    return undef;
}

__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 2, 6, 3, 7)(a, b);
}

__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @safe
{
    return shufflevector!(float4, 0, 4, 1, 5)(a, b);
}

__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
}


private
{
    /// Returns: `true` if the pointer is suitably aligned.
    bool isPointerAligned(void* p, size_t alignment) pure
    {
        assert(alignment != 0);
        return ( cast(size_t)p & (alignment - 1) ) == 0;
    }

    /// Returns: next pointer aligned with alignment bytes.
    void* nextAlignedPointer(void* start, size_t alignment) pure
    {
        return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
    }

    // Returns number of bytes to actually allocate when asking
    // for a particular alignment
    @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        return askedSize + alignment - 1 + pointerSize * 3;
    }

    // Stores the pointer given by malloc, plus size and alignment, just
    // below the aligned area
    @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        char* start = cast(char*)raw + pointerSize * 3;
        void* aligned = nextAlignedPointer(start, alignment);
        void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
        *rawLocation = raw;
        size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
        *sizeLocation = size;
        size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
        *alignmentLocation = alignment;
        assert( isPointerAligned(aligned, alignment) );
        return aligned;
    }

    // Returns: x, multiple of powerOfTwo, so that x >= n.
    @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
    {
        // check power-of-two
        assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));

        size_t mask = ~(powerOfTwo - 1);
        return (n + powerOfTwo - 1) & mask;
    }
}

unittest
{
    assert(nextMultipleOf(0, 4) == 0);
    assert(nextMultipleOf(1, 4) == 4);
    assert(nextMultipleOf(2, 4) == 4);
    assert(nextMultipleOf(3, 4) == 4);
    assert(nextMultipleOf(4, 4) == 4);
    assert(nextMultipleOf(5, 4) == 8);

    {
        void* p = _mm_malloc(23, 16);
        assert(p !is null);
        assert(((cast(size_t)p) & 0xf) == 0);
        _mm_free(p);
    }

    void* nullAlloc = _mm_malloc(0, 32);
    assert(nullAlloc != null);
    _mm_free(nullAlloc);
}