/**
* Copyright: Copyright Auburn Sounds 2016-2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.xmmintrin;

public import inteli.types;

import inteli.internals;

import inteli.mmx;
import inteli.emmintrin;

import core.stdc.stdlib: malloc, free;
import core.exception: onOutOfMemoryError;

version(D_InlineAsm_X86)
    version = InlineX86Asm;
else version(D_InlineAsm_X86_64)
    version = InlineX86Asm;


// SSE1

nothrow @nogc:


enum int _MM_EXCEPT_INVALID    = 0x0001;
enum int _MM_EXCEPT_DENORM     = 0x0002;
enum int _MM_EXCEPT_DIV_ZERO   = 0x0004;
enum int _MM_EXCEPT_OVERFLOW   = 0x0008;
enum int _MM_EXCEPT_UNDERFLOW  = 0x0010;
enum int _MM_EXCEPT_INEXACT    = 0x0020;
enum int _MM_EXCEPT_MASK       = 0x003f;

enum int _MM_MASK_INVALID      = 0x0080;
enum int _MM_MASK_DENORM       = 0x0100;
enum int _MM_MASK_DIV_ZERO     = 0x0200;
enum int _MM_MASK_OVERFLOW     = 0x0400;
enum int _MM_MASK_UNDERFLOW    = 0x0800;
enum int _MM_MASK_INEXACT      = 0x1000;
enum int _MM_MASK_MASK         = 0x1f80;

enum int _MM_ROUND_NEAREST     = 0x0000;
enum int _MM_ROUND_DOWN        = 0x2000;
enum int _MM_ROUND_UP          = 0x4000;
enum int _MM_ROUND_TOWARD_ZERO = 0x6000;
enum int _MM_ROUND_MASK        = 0x6000;

enum int _MM_FLUSH_ZERO_MASK   = 0x8000;
enum int _MM_FLUSH_ZERO_ON     = 0x8000;
enum int _MM_FLUSH_ZERO_OFF    = 0x0000;

__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    return a + b;
}
unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ps(a, a);
    assert(a.array[0] == 2);
    assert(a.array[1] == 4);
    assert(a.array[2] == 6);
    assert(a.array[3] == 8);
}

__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
        return __builtin_ia32_addss(a, b);
    else
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ss(a, a);
    assert(a.array == [2.0f, 2, 3, 4]);
}

__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
}
unittest
{
    // Note: tested in emmintrin.d
}

__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
}
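// A quick sanity check; assumes the usual SSE rounding-average
// semantics, i.e. (a + b + 1) >> 1 on each unsigned 16-bit lane.
unittest
{
    __m64 A = _mm_setr_pi16(1, 2, 3, 4);
    __m64 B = _mm_setr_pi16(5, 6, 7, 8);
    short4 R = cast(short4) _mm_avg_pu16(A, B);
    short[4] correct = [3, 4, 5, 6];
    assert(R.array == correct);
}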
/// Average packed unsigned 8-bit integers in `a` and `b`.
__m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
}

__m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
}

__m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
}

__m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.oge)(a, b);
}
unittest
{
    __m128i R = cast(__m128i) _mm_cmpge_ps(_mm_setr_ps(0, 1, -1, float.nan),
                                           _mm_setr_ps(0, 0, 0, 0));
    int[4] correct = [-1, -1, 0, 0];
    assert(R.array == correct);
}

__m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.oge)(a, b);
}

__m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
}

__m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
}

__m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ole)(a, b);
}

__m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ole)(a, b);
}

__m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.olt)(a, b);
}

__m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.olt)(a, b);
}

__m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.une)(a, b);
}

__m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.une)(a, b);
}

__m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ult)(a, b);
}

__m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ult)(a, b);
}

__m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ule)(a, b);
}

__m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ule)(a, b);
}

__m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ugt)(a, b);
}

__m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
}

__m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.uge)(a, b);
}

__m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.uge)(a, b);
}

__m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ord)(a, b);
}

__m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ord)(a, b);
}

__m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.uno)(a, b);
}

__m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.uno)(a, b);
}
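// `unord` should flag a lane whenever either input lane is NaN.
unittest
{
    __m128i R = cast(__m128i) _mm_cmpunord_ps(_mm_setr_ps(float.nan, 0, 1, float.nan),
                                              _mm_setr_ps(0, float.nan, 1, float.nan));
    int[4] correct = [-1, -1, 0, -1];
    assert(R.array == correct);
}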
// Note: we've reverted clang and GCC behaviour with regards to EFLAGS.
// Some of these comparisons yield true for NaNs, others don't.

int _mm_comieq_ss (__m128 a, __m128 b) pure @safe // comiss + sete
{
    return comss!(FPComparison.ueq)(a, b); // yields true for NaN!
}

int _mm_comige_ss (__m128 a, __m128 b) pure @safe // comiss + setae
{
    return comss!(FPComparison.oge)(a, b);
}

int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
{
    return comss!(FPComparison.ogt)(a, b);
}

int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
{
    return comss!(FPComparison.ule)(a, b); // yields true for NaN!
}

int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
{
    return comss!(FPComparison.ult)(a, b); // yields true for NaN!
}

int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
{
    return comss!(FPComparison.one)(a, b);
}

alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;

__m64 _mm_cvt_ps2pi (__m128 a) @safe
{
    return to_m64(_mm_cvtps_epi32(a));
}

__m128 _mm_cvt_si2ss(__m128 v, int x) pure @trusted
{
    v.ptr[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42f, 0, 0, 0]);
}


__m128 _mm_cvtpi16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpi16_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}

__m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted
{
    __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
    a.ptr[0] = fb.array[0];
    a.ptr[1] = fb.array[1];
    return a;
}
unittest
{
    __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
    float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
    assert(R.array == correct);
}


__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted
{
    long2 l;
    l.ptr[0] = a.array[0];
    l.ptr[1] = b.array[0];
    return _mm_cvtepi32_ps(cast(__m128i)l);
}
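// The two 32-bit integers of `a`, then the two of `b`, should become
// the four float lanes of the result.
unittest
{
    __m128 R = _mm_cvtpi32x2_ps(_mm_setr_pi32(1, 2), _mm_setr_pi32(3, 4));
    float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}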
__m128 _mm_cvtpi8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());

    // Replicate sign bit
    b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24); // Replicate sign bit
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpi8_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}

__m64 _mm_cvtps_pi16 (__m128 a) @safe
{
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw,
    // which means the 16-bit integers should be saturated.
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, b);
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
    short4 R = cast(short4) _mm_cvtps_pi16(A);
    short[4] correct = [-1, 2, -32768, 32767];
    assert(R.array == correct);
}

__m64 _mm_cvtps_pi32 (__m128 a) @safe
{
    return to_m64(_mm_cvtps_epi32(a));
}
unittest
{
    __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f);
    int2 R = cast(int2) _mm_cvtps_pi32(A);
    int[2] correct = [-33000, 70000];
    assert(R.array == correct);
}

__m64 _mm_cvtps_pi8 (__m128 a) @safe
{
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw + packsswb,
    // which means the 8-bit integers should be saturated.
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, _mm_setzero_si128());
    b = _mm_packs_epi16(b, _mm_setzero_si128());
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
    byte8 R = cast(byte8) _mm_cvtps_pi8(A);
    byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
    assert(R.array == correct);
}

__m128 _mm_cvtpu16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpu16_ps(A);
    float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f];
    assert(R.array == correct);
}

__m128 _mm_cvtpu8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpu8_ps(A);
    float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f];
    assert(R.array == correct);
}

__m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted
{
    v.ptr[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

// Note: on macOS, using "llvm.x86.sse.cvtsi642ss" was buggy
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted
{
    v.ptr[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

float _mm_cvtss_f32(__m128 a) pure @safe
{
    return a.array[0];
}
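// The lowest lane should be extracted as-is.
unittest
{
    assert(_mm_cvtss_f32(_mm_setr_ps(1.5f, 2.0f, 3.0f, 4.0f)) == 1.5f);
}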
static if (LDC_with_SSE1)
{
    alias _mm_cvtss_si32 = __builtin_ia32_cvtss2si;
}
else
{
    int _mm_cvtss_si32 (__m128 a) @safe
    {
        return convertFloatToInt32UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
}

version(LDC)
{
    version(X86_64)
        alias _mm_cvtss_si64 = __builtin_ia32_cvtss2si64;
    else
    {
        // Note: __builtin_ia32_cvtss2si64 crashes LDC in 32-bit
        long _mm_cvtss_si64 (__m128 a) @safe
        {
            return convertFloatToInt64UsingMXCSR(a.array[0]);
        }
    }
}
else
{
    long _mm_cvtss_si64 (__m128 a) @safe
    {
        return convertFloatToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}


static if (LDC_with_SSE1)
{
    alias _mm_cvtt_ss2si = __builtin_ia32_cvttss2si;
}
else
{
    int _mm_cvtt_ss2si (__m128 a) pure @safe
    {
        return cast(int)(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

__m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
{
    return to_m64(_mm_cvttps_epi32(a));
}

alias _mm_cvttss_si32 = _mm_cvtt_ss2si; // it's actually the same op

// Note: __builtin_ia32_cvttss2si64 crashes LDC when generating 32-bit x86 code.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    return a / b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ps(a, a);
    float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}

__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
        return __builtin_ia32_divss(a, b);
    else
    {
        a[0] /= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ss(a, a);
    float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}

int _mm_extract_pi16 (__m64 a, int imm8)
{
    short4 sa = cast(short4)a;
    return cast(ushort)(sa.array[imm8]);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 6, 0, 4);
    assert(_mm_extract_pi16(A, 0) == 65535);
    assert(_mm_extract_pi16(A, 1) == 6);
    assert(_mm_extract_pi16(A, 2) == 0);
    assert(_mm_extract_pi16(A, 3) == 4);
}

/// Free aligned memory that was allocated with `_mm_malloc`.
void _mm_free(void * mem_addr) @trusted
{
    // support for free(NULL)
    if (mem_addr is null)
        return;

    // Technically we don't need to store size and alignment in the chunk, but we do in case we
    // have to implement _mm_realloc

    size_t pointerSize = (void*).sizeof;
    void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof);
    size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
    size_t alignment = *alignmentLocation;
    assert(alignment != 0);
    assert(isPointerAligned(mem_addr, alignment));
    free(*rawLocation);
}

uint _MM_GET_EXCEPTION_MASK() @safe
{
    return _mm_getcsr() & _MM_MASK_MASK;
}

uint _MM_GET_EXCEPTION_STATE() @safe
{
    return _mm_getcsr() & _MM_EXCEPT_MASK;
}

uint _MM_GET_FLUSH_ZERO_MODE() @safe
{
    return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

uint _MM_GET_ROUNDING_MODE() @safe
{
    return _mm_getcsr() & _MM_ROUND_MASK;
}
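// Round-trip sketch: the rounding-mode bits should read back as written.
// Uses _MM_SET_ROUNDING_MODE, defined further below in this module.
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(_MM_GET_ROUNDING_MODE() == _MM_ROUND_TOWARD_ZERO);
    _MM_SET_ROUNDING_MODE(savedRounding);
}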
uint _mm_getcsr() @trusted
{
    static if (LDC_with_ARM)
    {
        // Note: we convert the ARM FPSCR into a x86 SSE control word.
        // However, only rounding mode and flush to zero are actually set.
        // The returned control word will have all exceptions masked, and no exception detected.

        uint fpscr = arm_get_fpcr();

        uint cw = 0; // No exception detected
        if (fpscr & _MM_FLUSH_ZERO_MASK_ARM)
        {
            // ARM has a single flush-to-zero flag, which covers both x86 bits.
            // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode
            cw |= _MM_FLUSH_ZERO_ON;
            cw |= 0x40; // set "denormals are zeros"
        }
        cw |= _MM_MASK_MASK; // All exceptions masked

        // Rounding mode
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     cw |= _MM_ROUND_NEAREST;     break;
            case _MM_ROUND_DOWN_ARM:        cw |= _MM_ROUND_DOWN;        break;
            case _MM_ROUND_UP_ARM:          cw |= _MM_ROUND_UP;          break;
            case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO; break;
        }
        return cw;
    }
    else version(GNU)
    {
        static if (GDC_with_SSE)
        {
            return __builtin_ia32_stmxcsr();
        }
        else version(X86)
        {
            uint sseRounding = 0;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %0;\n"
                : "=m" (sseRounding)
                :
                : ;
            }
            return sseRounding;
        }
        else
            static assert(false);
    }
    else version (InlineX86Asm)
    {
        uint controlWord;
        asm nothrow @nogc pure @safe
        {
            stmxcsr controlWord;
        }
        return controlWord;
    }
    else
        static assert(0, "Not yet supported");
}
unittest
{
    uint csr = _mm_getcsr();
}

__m64 _mm_insert_pi16 (__m64 v, int i, int index) pure @trusted
{
    short4 r = cast(short4)v;
    r.ptr[index & 3] = cast(short)i;
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_set_pi16(3, 2, 1, 0);
    short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4);
    short[4] correct = [0, 42, 2, 3];
    assert(R.array == correct);
}

__m128 _mm_load_ps(const(float)*p) pure @trusted
{
    return *cast(__m128*)p;
}

__m128 _mm_load_ps1(const(float)*p) pure @trusted
{
    return __m128(*p);
}

__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    __m128 r;
    r.ptr[0] = *mem_addr;
    r.ptr[1] = 0;
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return r;
}

alias _mm_load1_ps = _mm_load_ps1;

__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
{
    long2 la = cast(long2)a;
    la.ptr[1] = (*mem_addr).array[0];
    return cast(__m128)la;
}

__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
{
    long2 la = cast(long2)a;
    la.ptr[0] = (*mem_addr).array[0];
    return cast(__m128)la;
}

__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 a = *aligned;
    __m128 r;
    r.ptr[0] = a.array[3];
    r.ptr[1] = a.array[2];
    r.ptr[2] = a.array[1];
    r.ptr[3] = a.array[0];
    return r;
}

__m128 _mm_loadu_ps(const(float)*p) pure @safe
{
    return loadUnaligned!(__m128)(p);
}
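// Unlike _mm_load_ps, this should tolerate a pointer that is not
// 16-byte aligned, so load deliberately off by one float.
unittest
{
    align(16) float[5] mem = [1.0f, 2.0f, 3.0f, 4.0f, 5.0f];
    __m128 A = _mm_loadu_ps(mem.ptr + 1); // misaligned by 4 bytes
    float[4] correct = [2.0f, 3.0f, 4.0f, 5.0f];
    assert(A.array == correct);
}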
__m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted
{
    short r = *cast(short*)(mem_addr);
    short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
    result.ptr[0] = r;
    return cast(__m128i)result;
}
unittest
{
    short r = 13;
    short8 A = cast(short8) _mm_loadu_si16(&r);
    short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
    assert(A.array == correct);
}

__m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted
{
    long r = *cast(long*)(mem_addr);
    long2 result = [0, 0];
    result.ptr[0] = r;
    return cast(__m128i)result;
}
unittest
{
    long r = 446446446446;
    long2 A = cast(long2) _mm_loadu_si64(&r);
    long[2] correct = [446446446446, 0];
    assert(A.array == correct);
}

/// Allocate `size` bytes of memory, aligned to the alignment specified in `alignment`,
/// and return a pointer to the allocated memory. `_mm_free` should be used to free
/// memory that is allocated with `_mm_malloc`.
void* _mm_malloc(size_t size, size_t alignment) @trusted
{
    assert(alignment != 0);
    size_t request = requestedSize(size, alignment);
    void* raw = malloc(request);
    if (request > 0 && raw == null) // malloc(0) can validly return anything
        onOutOfMemoryError();
    return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
}

void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted
{
    // this works since mask is zero-extended
    return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr);
}

deprecated alias _m_maskmovq = _mm_maskmove_si64;

__m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b)));
}

static if (GDC_with_SSE)
{
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else static if (LDC_with_SSE1)
{
    alias _mm_max_ps = __builtin_ia32_maxps;
}
else
{
    __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
    {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
        __m128 r;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        r[1] = (a[1] > b[1]) ? a[1] : b[1];
        r[2] = (a[2] > b[2]) ? a[2] : b[2];
        r[3] = (a[3] > b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_max_ps(A, B);
    assert(M.array[0] == 4);
    assert(M.array[1] == 2);
    assert(M.array[2] == 4);          // in case of NaN, second operand prevails (as it seems)
    assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
}

__m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b)));
}

static if (GDC_with_SSE)
{
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else static if (LDC_with_SSE1)
{
    alias _mm_max_ss = __builtin_ia32_maxss;
}
else
{
    __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
    {
        __m128 r = a;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_max_ss(A, B);
    assert(M.array[0] == 4);
    assert(M.array[1] == 2);
    assert(M.array[2] == 3);
    assert(M.array[3] == 4);
    M = _mm_max_ps(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_max_ps(C, A); // in case of NaN, second operand prevails
    assert(M.array[0] == 1);
}

__m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b)));
}
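// Signed 16-bit minimum, lane by lane.
unittest
{
    __m64 A = _mm_setr_pi16(-1, 6, -4, 4);
    __m64 B = _mm_setr_pi16(2, -6, 4, -4);
    short4 R = cast(short4) _mm_min_pi16(A, B);
    short[4] correct = [-1, -6, -4, -4];
    assert(R.array == correct);
}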
static if (GDC_with_SSE)
{
    alias _mm_min_ps = __builtin_ia32_minps;
}
else static if (LDC_with_SSE1)
{
    alias _mm_min_ps = __builtin_ia32_minps;
}
else
{
    __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
    {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
        __m128 r;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        r[1] = (a[1] < b[1]) ? a[1] : b[1];
        r[2] = (a[2] < b[2]) ? a[2] : b[2];
        r[3] = (a[3] < b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_min_ps(A, B);
    assert(M.array[0] == 1);
    assert(M.array[1] == 1);
    assert(M.array[2] == 4);          // in case of NaN, second operand prevails (as it seems)
    assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
}

__m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b)));
}

static if (GDC_with_SSE)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}
else static if (LDC_with_SSE1)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}
else
{
    __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
    {
        // Generates minss since LDC 1.3 -O1
        __m128 r = a;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_min_ss(A, B);
    assert(M.array[0] == 1);
    assert(M.array[1] == 2);
    assert(M.array[2] == 3);
    assert(M.array[3] == 4);
    M = _mm_min_ps(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_min_ps(C, A); // in case of NaN, second operand prevails
    assert(M.array[0] == 1);
}

__m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}

__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted
{
    // Matches MOVHLPS: the upper two floats of b go to the lower half
    // of the result, while the upper half of a is kept.
    // (The previous version had a and b swapped, which also broke _MM_TRANSPOSE4_PS.)
    a.ptr[0] = b.array[2];
    a.ptr[1] = b.array[3];
    return a;
}

__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted
{
    a.ptr[2] = b.array[0];
    a.ptr[3] = b.array[1];
    return a;
}
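// movelh takes the low halves of (a, b); movehl puts the high half
// of b below the high half of a.
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(5, 6, 7, 8);
    __m128 LH = _mm_movelh_ps(A, B);
    __m128 HL = _mm_movehl_ps(A, B);
    float[4] correctLH = [1.0f, 2.0f, 5.0f, 6.0f];
    float[4] correctHL = [7.0f, 8.0f, 3.0f, 4.0f];
    assert(LH.array == correctLH);
    assert(HL.array == correctHL);
}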
int _mm_movemask_pi8 (__m64 a) pure @safe
{
    return _mm_movemask_epi8(to_m128i(a));
}
unittest
{
    assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0)));
}

static if (GDC_with_SSE)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}
else static if (LDC_with_SSE1)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}
else static if (LDC_with_ARM)
{
    int _mm_movemask_ps (__m128 a) pure @safe
    {
        int4 ai = cast(int4)a;
        int4 shift31 = [31, 31, 31, 31];
        ai = ai >>> shift31;
        int4 shift = [0, 1, 2, 3];
        ai = ai << shift; // 4-way shift, only efficient on ARM.
        int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]);
        return r;
    }
}
else
{
    int _mm_movemask_ps (__m128 a) pure @safe
    {
        int4 ai = cast(int4)a;
        int r = 0;
        if (ai[0] < 0) r += 1;
        if (ai[1] < 0) r += 2;
        if (ai[2] < 0) r += 4;
        if (ai[3] < 0) r += 8;
        return r;
    }
}
unittest
{
    int4 A = [-1, 0, -43, 0];
    assert(5 == _mm_movemask_ps(cast(float4)A));
}

__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    return a * b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ps(a, a);
    float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(a.array == correct);
}

__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
        return __builtin_ia32_mulss(a, b);
    else
    {
        a[0] *= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ss(a, a);
    float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(a.array == correct);
}

__m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(0, -16, 2, 3);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pu16(A, B);
    short[4] correct = [0, 0x3FFC, 0, 0];
    assert(R.array == correct);
}

__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
}
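// The OR acts on raw float bit patterns, so check through integer views.
unittest
{
    __m128 A = cast(__m128) _mm_setr_epi32(1, 2, 4, 8);
    __m128 B = cast(__m128) _mm_setr_epi32(2, 2, 8, 8);
    __m128i R = cast(__m128i) _mm_or_ps(A, B);
    int[4] correct = [3, 2, 12, 8];
    assert(R.array == correct);
}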
deprecated alias
    _m_pavgb = _mm_avg_pu8,
    _m_pavgw = _mm_avg_pu16,
    _m_pextrw = _mm_extract_pi16,
    _m_pinsrw = _mm_insert_pi16,
    _m_pmaxsw = _mm_max_pi16,
    _m_pmaxub = _mm_max_pu8,
    _m_pminsw = _mm_min_pi16,
    _m_pminub = _mm_min_pu8,
    _m_pmovmskb = _mm_movemask_pi8,
    _m_pmulhuw = _mm_mulhi_pu16;

enum _MM_HINT_T0  = 3; ///
enum _MM_HINT_T1  = 2; ///
enum _MM_HINT_T2  = 1; ///
enum _MM_HINT_NTA = 0; ///


version(LDC)
{
    // Starting with LLVM 10, it seems llvm.prefetch has changed its name.
    // Was reported at: https://github.com/ldc-developers/ldc/issues/3397
    static if (__VERSION__ >= 2091)
    {
        pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch"
        void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe;
    }
}

/// Fetch the line of data from memory that contains address `p` to a location in the
/// cache hierarchy specified by the locality hint `locality`.
///
/// Warning: `locality` is a compile-time parameter, unlike in the Intel Intrinsics API.
void _mm_prefetch(int locality)(const(void)* p) pure @trusted
{
    static if (GDC_with_SSE)
    {
        return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2091)
        {
            // const_cast here. `llvm_prefetch` wants a mutable pointer
            llvm_prefetch_fixed( cast(void*)p, 0, locality, 1);
        }
        else
        {
            // const_cast here. `llvm_prefetch` wants a mutable pointer
            llvm_prefetch( cast(void*)p, 0, locality, 1);
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        static if (locality == _MM_HINT_NTA)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetchnta [RAX];
            }
        }
        else static if (locality == _MM_HINT_T0)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetcht0 [RAX];
            }
        }
        else static if (locality == _MM_HINT_T1)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetcht1 [RAX];
            }
        }
        else static if (locality == _MM_HINT_T2)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetcht2 [RAX];
            }
        }
        else
            assert(false); // invalid locality hint
    }
    else version(D_InlineAsm_X86)
    {
        static if (locality == _MM_HINT_NTA)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetchnta [EAX];
            }
        }
        else static if (locality == _MM_HINT_T0)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetcht0 [EAX];
            }
        }
        else static if (locality == _MM_HINT_T1)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetcht1 [EAX];
            }
        }
        else static if (locality == _MM_HINT_T2)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetcht2 [EAX];
            }
        }
        else
            assert(false); // invalid locality hint
    }
    else
    {
        // Generic version: do nothing. From bitter experience,
        // it's unlikely you get ANY speed-up with manual prefetching.
        // Prefetching or not doesn't change program behaviour.
    }
}
unittest
{
    // From Intel documentation:
    // "The amount of data prefetched is also processor implementation-dependent.
    //  It will, however, be a minimum of 32 bytes."
    ubyte[256] cacheline; // though it seems it cannot generate GP fault
    _mm_prefetch!_MM_HINT_T0(cacheline.ptr);
    _mm_prefetch!_MM_HINT_T1(cacheline.ptr);
    _mm_prefetch!_MM_HINT_T2(cacheline.ptr);
    _mm_prefetch!_MM_HINT_NTA(cacheline.ptr);
}
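// A usage sketch: software-prefetch a few iterations ahead of a streaming
// read. Whether this helps at all is hardware-dependent; by definition
// it must not change the result.
unittest
{
    float[128] data;
    foreach (i; 0..128)
        data[i] = i;
    float sum = 0;
    foreach (i; 0..128)
    {
        if (i + 16 < 128)
            _mm_prefetch!_MM_HINT_T0(data.ptr + i + 16);
        sum += data[i];
    }
    assert(sum == 8128.0f); // 0 + 1 + ... + 127
}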
deprecated alias
    _m_psadbw = _mm_sad_pu8,
    _m_pshufw = _mm_shuffle_pi16;

static if (GDC_with_SSE)
{
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
else static if (LDC_with_SSE1)
{
    alias _mm_rcp_ps = __builtin_ia32_rcpps;
}
else
{
    __m128 _mm_rcp_ps (__m128 a) pure @safe
    {
        a[0] = 1.0f / a[0];
        a[1] = 1.0f / a[1];
        a[2] = 1.0f / a[2];
        a[3] = 1.0f / a[3];
        return a;
    }
}

static if (GDC_with_SSE)
{
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
else static if (LDC_with_SSE1)
{
    alias _mm_rcp_ss = __builtin_ia32_rcpss;
}
else
{
    __m128 _mm_rcp_ss (__m128 a) pure @safe
    {
        a[0] = 1.0f / a[0];
        return a;
    }
}

static if (GDC_with_SSE)
{
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
else static if (LDC_with_SSE1)
{
    alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps;
}
else version(LDC)
{
    __m128 _mm_rsqrt_ps (__m128 a) pure @safe
    {
        a[0] = 1.0f / llvm_sqrt(a[0]);
        a[1] = 1.0f / llvm_sqrt(a[1]);
        a[2] = 1.0f / llvm_sqrt(a[2]);
        a[3] = 1.0f / llvm_sqrt(a[3]);
        return a;
    }
}
else
{
    __m128 _mm_rsqrt_ps (__m128 a) pure @safe
    {
        a[0] = 1.0f / sqrt(a[0]);
        a[1] = 1.0f / sqrt(a[1]);
        a[2] = 1.0f / sqrt(a[2]);
        a[3] = 1.0f / sqrt(a[3]);
        return a;
    }
}

static if (GDC_with_SSE)
{
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
else static if (LDC_with_SSE1)
{
    alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss;
}
else version(LDC)
{
    __m128 _mm_rsqrt_ss (__m128 a) pure @safe
    {
        a[0] = 1.0f / llvm_sqrt(a[0]);
        return a;
    }
}
else
{
    __m128 _mm_rsqrt_ss (__m128 a) pure @safe
    {
        a[0] = 1.0f / sqrt(a[0]);
        return a;
    }
}

unittest
{
    double maxRelativeError = 0.000245; // -72 dB
    void testInvSqrt(float number) nothrow @nogc
    {
        __m128 A = _mm_set1_ps(number);

        // test _mm_rcp_ps
        __m128 B = _mm_rcp_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / A.array[i];
            double ratio = cast(double)(B.array[i]) / cast(double)(exact);
            assert(abs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rcp_ss
        {
            B = _mm_rcp_ss(A);
            double exact = 1.0f / A.array[0];
            double ratio = cast(double)(B.array[0]) / cast(double)(exact);
            assert(abs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ps
        B = _mm_rsqrt_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / sqrt(A.array[i]);
            double ratio = cast(double)(B.array[i]) / cast(double)(exact);
            assert(abs(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ss
        {
            B = _mm_rsqrt_ss(A);
            double exact = 1.0f / sqrt(A.array[0]);
            double ratio = cast(double)(B.array[0]) / cast(double)(exact);
            assert(abs(ratio - 1) <= maxRelativeError);
        }
    }

    testInvSqrt(1.1f);
    testInvSqrt(2.45674864151f);
    testInvSqrt(27841456468.0f);
}

__m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b)));
}
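// The sum of absolute differences (here 32) should land in the low
// 16-bit lane, with the other lanes zeroed.
unittest
{
    __m64 A = _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
    __m64 B = _mm_setr_pi8(7, 6, 5, 4, 3, 2, 1, 0);
    short4 R = cast(short4) _mm_sad_pu8(A, B);
    short[4] correct = [32, 0, 0, 0];
    assert(R.array == correct);
}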
void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe
{
    // Note: unsupported on ARM
    _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
}

void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe
{
    // Note: unsupported on ARM
    _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
}

void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
}

__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    // Note: despite appearances, generates sensible code,
    // inlines correctly and is constant folded
    float[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(float4)(result.ptr);
}
unittest
{
    __m128 A = _mm_set_ps(3, 2, 1, 546);
    float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f];
    assert(A.array == correct);
    assert(A.array[0] == 546.0f);
    assert(A.array[1] == 1.0f);
    assert(A.array[2] == 2.0f);
    assert(A.array[3] == 3.0f);
}

alias _mm_set_ps1 = _mm_set1_ps;

void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
}

__m128 _mm_set_ss (float a) pure @trusted
{
    __m128 r = _mm_setzero_ps();
    r.ptr[0] = a;
    return r;
}
unittest
{
    float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
    __m128 A = _mm_set_ss(42.0f);
    assert(A.array == correct);
}

__m128 _mm_set1_ps (float a) pure @trusted
{
    __m128 r = void;
    r.ptr[0] = a;
    r.ptr[1] = a;
    r.ptr[2] = a;
    r.ptr[3] = a;
    return r;
}
unittest
{
    float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f];
    __m128 A = _mm_set1_ps(42.0f);
    assert(A.array == correct);
}
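// Round-trip sketch for the flush-to-zero bit; restores the previous
// mode afterwards, like the rounding-mode tests above.
unittest
{
    uint saved = _MM_GET_FLUSH_ZERO_MODE();
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    assert(_MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON);
    _MM_SET_FLUSH_ZERO_MODE(saved);
}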
void _mm_setcsr(uint controlWord) @trusted
{
    static if (LDC_with_ARM)
    {
        // Convert from SSE to ARM control word. This is done _partially_:
        // only rounding-mode and flush-to-zero changes are supported.

        // "To alter some bits of a VFP system register without
        // affecting other bits, use a read-modify-write procedure"
        uint fpscr = arm_get_fpcr();

        // Bits 23 to 22 are rounding modes, however not used in NEON
        fpscr = fpscr & ~_MM_ROUND_MASK_ARM;
        switch(controlWord & _MM_ROUND_MASK)
        {
            default:
            case _MM_ROUND_NEAREST:     fpscr |= _MM_ROUND_NEAREST_ARM;     break;
            case _MM_ROUND_DOWN:        fpscr |= _MM_ROUND_DOWN_ARM;        break;
            case _MM_ROUND_UP:          fpscr |= _MM_ROUND_UP_ARM;          break;
            case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break;
        }
        fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM;
        if (controlWord & _MM_FLUSH_ZERO_MASK)
            fpscr |= _MM_FLUSH_ZERO_MASK_ARM;
        arm_set_fpcr(fpscr);
    }
    else version(GNU)
    {
        static if (GDC_with_SSE)
        {
            __builtin_ia32_ldmxcsr(controlWord);
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "ldmxcsr %0;\n"
                :
                : "m" (controlWord)
                : ;
            }
        }
        else
            static assert(false);
    }
    else version (InlineX86Asm)
    {
        asm pure nothrow @nogc @safe
        {
            ldmxcsr controlWord;
        }
    }
    else
        static assert(0, "Not yet supported");
}
unittest
{
    _mm_setcsr(_mm_getcsr());
}

__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] result = [e3, e2, e1, e0];
    return loadUnaligned!(float4)(result.ptr);
}
unittest
{
    __m128 A = _mm_setr_ps(3, 2, 1, 546);
    float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f];
    assert(A.array == correct);
    assert(A.array[0] == 3.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 1.0f);
    assert(A.array[3] == 546.0f);
}

__m128 _mm_setzero_ps() pure @trusted
{
    // Compiles to xorps without problems
    float[4] result = [0.0f, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(result.ptr);
}

version(GNU)
{
    void _mm_sfence() pure @trusted
    {
        static if (GDC_with_SSE)
        {
            __builtin_ia32_sfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "sfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else static if (LDC_with_SSE1)
{
    alias _mm_sfence = __builtin_ia32_sfence;
}
else static if (DMD_with_asm)
{
    void _mm_sfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            sfence;
        }
    }
}
else version(LDC)
{
    void _mm_sfence() pure @safe
    {
        llvm_memory_fence(); // PERF: generates mfence
    }
}
else
    static assert(false);
unittest
{
    _mm_sfence();
}

__m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @safe
{
    return cast(__m64) shufflevector!(short4, ( (imm8 >> 0) & 3 ),
                                              ( (imm8 >> 2) & 3 ),
                                              ( (imm8 >> 4) & 3 ),
                                              ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a);
}
unittest
{
    __m64 A = _mm_setr_pi16(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A);
    short[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

// Note: the immediate shuffle value is given at compile-time instead of runtime.
__m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
}
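// With _MM_SHUFFLE(0, 1, 2, 3): lanes 3 and 2 of `a`, then lanes 1 and 0
// of `b`, mirroring the _mm_shuffle_pi16 test above.
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(5, 6, 7, 8);
    enum ubyte SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    __m128 R = _mm_shuffle_ps!SHUFFLE(A, B);
    float[4] correct = [4.0f, 3.0f, 6.0f, 5.0f];
    assert(R.array == correct);
}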
static if (GDC_with_SSE)
{
    alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
}
else version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ps = __builtin_ia32_sqrtps;
    else
    {
        __m128 _mm_sqrt_ps(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            vec.array[2] = llvm_sqrt(vec.array[2]);
            vec.array[3] = llvm_sqrt(vec.array[3]);
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ps(__m128 vec) pure @trusted
    {
        vec.ptr[0] = sqrt(vec.array[0]);
        vec.ptr[1] = sqrt(vec.array[1]);
        vec.ptr[2] = sqrt(vec.array[2]);
        vec.ptr[3] = sqrt(vec.array[3]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 2.0f);
    assert(A.array[3] == 2.0f);
}

static if (GDC_with_SSE)
{
    alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
}
else version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_ss = __builtin_ia32_sqrtss;
    else
    {
        __m128 _mm_sqrt_ss(__m128 vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            vec.array[2] = vec.array[2];
            vec.array[3] = vec.array[3];
            return vec;
        }
    }
}
else
{
    __m128 _mm_sqrt_ss(__m128 vec) pure @trusted
    {
        vec.ptr[0] = sqrt(vec.array[0]);
        return vec;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 4.0f);
    assert(A.array[2] == 4.0f);
    assert(A.array[3] == 4.0f);
}

void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = a;
}

alias _mm_store_ps1 = _mm_store1_ps;

void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    *mem_addr = a.array[0];
}
unittest
{
    float a;
    _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546));
    assert(a == 546);
}

void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[0];
    r.ptr[2] = a.array[0];
    r.ptr[3] = a.array[0];
    *aligned = r;
}
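// Broadcasts the lowest lane to all four stored floats.
unittest
{
    align(16) float[4] A;
    _mm_store1_ps(A.ptr, _mm_set_ss(42.0f));
    float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f];
    assert(A == correct);
}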
void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted
{
    long2 la = cast(long2)a;
    (*p).ptr[0] = la.array[1];
}
unittest
{
    __m64 R = _mm_setzero_si64();
    long2 A = [13, 25];
    _mm_storeh_pi(&R, cast(__m128)A);
    assert(R.array[0] == 25);
}

void _mm_storel_pi(__m64* p, __m128 a) pure @trusted
{
    long2 la = cast(long2)a;
    (*p).ptr[0] = la.array[0];
}
unittest
{
    __m64 R = _mm_setzero_si64();
    long2 A = [13, 25];
    _mm_storel_pi(&R, cast(__m128)A);
    assert(R.array[0] == 13);
}

void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // not safe since nothing guarantees alignment
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 r;
    r.ptr[0] = a.array[3];
    r.ptr[1] = a.array[2];
    r.ptr[2] = a.array[1];
    r.ptr[3] = a.array[0];
    *aligned = r;
}

void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe
{
    storeUnaligned!(float4)(a, mem_addr);
}

void _mm_stream_pi (__m64* mem_addr, __m64 a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a; // it's a regular move instead
}

// BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal
// needs some IR outside this function that would say:
//
//  !0 = !{ i32 1 }
//
// It's a LLVM IR metadata description.
// Regardless, non-temporal moves are really dangerous for performance...
void _mm_stream_ps (float* mem_addr, __m128 a)
{
    __m128* dest = cast(__m128*)mem_addr;
    *dest = a; // it's a regular move instead
}
unittest
{
    align(16) float[4] A;
    _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
    assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
}

__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    return a - b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ps(a, a);
    float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == correct);
}

__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
        return __builtin_ia32_subss(a, b);
    else
    {
        a[0] -= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ss(a, a);
    float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}


void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    __m128 tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm_unpacklo_ps(row0, row1);
    tmp2 = _mm_unpacklo_ps(row2, row3);
    tmp1 = _mm_unpackhi_ps(row0, row1);
    tmp3 = _mm_unpackhi_ps(row2, row3);
    row0 = _mm_movelh_ps(tmp0, tmp2);
    row1 = _mm_movehl_ps(tmp2, tmp0);
    row2 = _mm_movelh_ps(tmp1, tmp3);
    row3 = _mm_movehl_ps(tmp3, tmp1);
}
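// Rows should become the columns of the original 4x4 matrix.
unittest
{
    __m128 l0 = _mm_setr_ps( 1,  2,  3,  4);
    __m128 l1 = _mm_setr_ps( 5,  6,  7,  8);
    __m128 l2 = _mm_setr_ps( 9, 10, 11, 12);
    __m128 l3 = _mm_setr_ps(13, 14, 15, 16);
    _MM_TRANSPOSE4_PS(l0, l1, l2, l3);
    float[4] correct0 = [1.0f, 5.0f,  9.0f, 13.0f];
    float[4] correct1 = [2.0f, 6.0f, 10.0f, 14.0f];
    float[4] correct2 = [3.0f, 7.0f, 11.0f, 15.0f];
    float[4] correct3 = [4.0f, 8.0f, 12.0f, 16.0f];
    assert(l0.array == correct0);
    assert(l1.array == correct1);
    assert(l2.array == correct2);
    assert(l3.array == correct3);
}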
// Note: the only difference between these intrinsics is the signalling
// behaviour of quiet NaNs. Aliasing them is incorrect, but the case where
// you would want to differentiate between qNaN and sNaN and then
// treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_ss = _mm_comieq_ss;
alias _mm_ucomige_ss = _mm_comige_ss;
alias _mm_ucomigt_ss = _mm_comigt_ss;
alias _mm_ucomile_ss = _mm_comile_ss;
alias _mm_ucomilt_ss = _mm_comilt_ss;
alias _mm_ucomineq_ss = _mm_comineq_ss;


__m128 _mm_undefined_ps() pure @safe
{
    __m128 undef = void;
    return undef;
}

__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted
{
    __m128 r;
    r.ptr[0] = a.array[2];
    r.ptr[1] = b.array[2];
    r.ptr[2] = a.array[3];
    r.ptr[3] = b.array[3];
    return r;
}

__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted
{
    __m128 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = b.array[0];
    r.ptr[2] = a.array[1];
    r.ptr[3] = b.array[1];
    return r;
}

__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
}


private
{
    /// Returns: `true` if the pointer is suitably aligned.
    bool isPointerAligned(void* p, size_t alignment) pure
    {
        assert(alignment != 0);
        return ( cast(size_t)p & (alignment - 1) ) == 0;
    }

    /// Returns: next pointer aligned with alignment bytes.
    void* nextAlignedPointer(void* start, size_t alignment) pure
    {
        return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
    }

    // Returns number of bytes to actually allocate when asking
    // for a particular alignment
    @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        return askedSize + alignment - 1 + pointerSize * 3;
    }

    // Store the pointer given by malloc, along with size and alignment
    @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        char* start = cast(char*)raw + pointerSize * 3;
        void* aligned = nextAlignedPointer(start, alignment);
        void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
        *rawLocation = raw;
        size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
        *sizeLocation = size;
        size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
        *alignmentLocation = alignment;
        assert( isPointerAligned(aligned, alignment) );
        return aligned;
    }

    // Returns: x, multiple of powerOfTwo, so that x >= n.
    @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
    {
        // check power-of-two
        assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));

        size_t mask = ~(powerOfTwo - 1);
        return (n + powerOfTwo - 1) & mask;
    }
}

unittest
{
    assert(nextMultipleOf(0, 4) == 0);
    assert(nextMultipleOf(1, 4) == 4);
    assert(nextMultipleOf(2, 4) == 4);
    assert(nextMultipleOf(3, 4) == 4);
    assert(nextMultipleOf(4, 4) == 4);
    assert(nextMultipleOf(5, 4) == 8);

    {
        void* p = _mm_malloc(23, 16);
        assert(p !is null);
        assert(((cast(size_t)p) & 0xf) == 0);
        _mm_free(p);
    }

    void* nullAlloc = _mm_malloc(0, 32);
    assert(nullAlloc != null);
    _mm_free(nullAlloc);
}

// For some reason, order of declaration is important for this one,
// so it is misplaced.
// Note: it is just another name for _mm_cvtss_si32.
alias _mm_cvt_ss2si = _mm_cvtss_si32;
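// Quick sanity check through the alias; mirrors the _mm_cvtss_si32 test above.
unittest
{
    assert(1 == _mm_cvt_ss2si(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
}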