1 /** 2 * SSE intrinsics. 3 * 4 * Copyright: Copyright Auburn Sounds 2016-2019. 5 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 6 * Authors: Guillaume Piolat 7 */ 8 module inteli.xmmintrin; 9 10 public import inteli.types; 11 12 import inteli.internals; 13 14 import inteli.mmx; 15 import inteli.emmintrin; 16 17 import core.stdc.stdlib: malloc, free; 18 import core.exception: onOutOfMemoryError; 19 20 version(D_InlineAsm_X86) 21 version = InlineX86Asm; 22 else version(D_InlineAsm_X86_64) 23 version = InlineX86Asm; 24 25 26 // SSE1 27 28 nothrow @nogc: 29 30 31 enum int _MM_EXCEPT_INVALID = 0x0001; /// MXCSR Exception states. 32 enum int _MM_EXCEPT_DENORM = 0x0002; ///ditto 33 enum int _MM_EXCEPT_DIV_ZERO = 0x0004; ///ditto 34 enum int _MM_EXCEPT_OVERFLOW = 0x0008; ///ditto 35 enum int _MM_EXCEPT_UNDERFLOW = 0x0010; ///ditto 36 enum int _MM_EXCEPT_INEXACT = 0x0020; ///ditto 37 enum int _MM_EXCEPT_MASK = 0x003f; /// MXCSR Exception states mask. 38 39 enum int _MM_MASK_INVALID = 0x0080; /// MXCSR Exception masks. 40 enum int _MM_MASK_DENORM = 0x0100; ///ditto 41 enum int _MM_MASK_DIV_ZERO = 0x0200; ///ditto 42 enum int _MM_MASK_OVERFLOW = 0x0400; ///ditto 43 enum int _MM_MASK_UNDERFLOW = 0x0800; ///ditto 44 enum int _MM_MASK_INEXACT = 0x1000; ///ditto 45 enum int _MM_MASK_MASK = 0x1f80; /// MXCSR Exception masks mask. 46 47 enum int _MM_ROUND_NEAREST = 0x0000; /// MXCSR Rounding mode. 48 enum int _MM_ROUND_DOWN = 0x2000; ///ditto 49 enum int _MM_ROUND_UP = 0x4000; ///ditto 50 enum int _MM_ROUND_TOWARD_ZERO = 0x6000; ///ditto 51 enum int _MM_ROUND_MASK = 0x6000; /// MXCSR Rounding mode mask. 52 53 enum int _MM_FLUSH_ZERO_MASK = 0x8000; /// MXCSR Denormal flush to zero mask. 54 enum int _MM_FLUSH_ZERO_ON = 0x8000; /// MXCSR Denormal flush to zero modes. 55 enum int _MM_FLUSH_ZERO_OFF = 0x0000; ///ditto 56 57 /// Add packed single-precision (32-bit) floating-point elements in `a` and `b`. 58 __m128 _mm_add_ps(__m128 a, __m128 b) pure @safe 59 { 60 return a + b; 61 } 62 unittest 63 { 64 __m128 a = [1, 2, 3, 4]; 65 a = _mm_add_ps(a, a); 66 assert(a.array[0] == 2); 67 assert(a.array[1] == 4); 68 assert(a.array[2] == 6); 69 assert(a.array[3] == 8); 70 } 71 72 /// Add the lower single-precision (32-bit) floating-point element 73 /// in `a` and `b`, store the result in the lower element of result, 74 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 75 __m128 _mm_add_ss(__m128 a, __m128 b) pure @safe 76 { 77 static if (GDC_with_SSE) 78 return __builtin_ia32_addss(a, b); 79 else 80 { 81 a[0] += b[0]; 82 return a; 83 } 84 } 85 unittest 86 { 87 __m128 a = [1, 2, 3, 4]; 88 a = _mm_add_ss(a, a); 89 assert(a.array == [2.0f, 2, 3, 4]); 90 } 91 92 /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`. 93 __m128 _mm_and_ps (__m128 a, __m128 b) pure @safe 94 { 95 return cast(__m128)(cast(__m128i)a & cast(__m128i)b); 96 } 97 unittest 98 { 99 float a = 4.32f; 100 float b = -78.99f; 101 int correct = (*cast(int*)(&a)) & (*cast(int*)(&b)); 102 __m128 A = _mm_set_ps(a, b, a, b); 103 __m128 B = _mm_set_ps(b, a, b, a); 104 int4 R = cast(int4)( _mm_and_ps(A, B) ); 105 assert(R.array[0] == correct); 106 assert(R.array[1] == correct); 107 assert(R.array[2] == correct); 108 assert(R.array[3] == correct); 109 } 110 111 /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`. 
112 __m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe 113 { 114 return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b ); 115 } 116 unittest 117 { 118 float a = 4.32f; 119 float b = -78.99f; 120 int correct = ~(*cast(int*)(&a)) & (*cast(int*)(&b)); 121 int correct2 = (*cast(int*)(&a)) & ~(*cast(int*)(&b)); 122 __m128 A = _mm_set_ps(a, b, a, b); 123 __m128 B = _mm_set_ps(b, a, b, a); 124 int4 R = cast(int4)( _mm_andnot_ps(A, B) ); 125 assert(R.array[0] == correct2); 126 assert(R.array[1] == correct); 127 assert(R.array[2] == correct2); 128 assert(R.array[3] == correct); 129 } 130 131 /// Average packed unsigned 16-bit integers in ``a` and `b`. 132 __m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe 133 { 134 return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b))); 135 } 136 137 /// Average packed unsigned 8-bit integers in ``a` and `b`. 138 __m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe 139 { 140 return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b))); 141 } 142 143 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for equality. 144 __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe 145 { 146 return cast(__m128) cmpps!(FPComparison.oeq)(a, b); 147 } 148 149 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for equality, 150 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 151 __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe 152 { 153 return cast(__m128) cmpss!(FPComparison.oeq)(a, b); 154 } 155 156 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal. 157 __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe 158 { 159 return cast(__m128) cmpps!(FPComparison.oge)(a, b); 160 } 161 unittest 162 { 163 __m128i R = cast(__m128i) _mm_cmpge_ps(_mm_setr_ps(0, 1, -1, float.nan), 164 _mm_setr_ps(0, 0, 0, 0)); 165 int[4] correct = [-1, -1, 0, 0]; 166 assert(R.array == correct); 167 } 168 169 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal, 170 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 171 __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe 172 { 173 return cast(__m128) cmpss!(FPComparison.oge)(a, b); 174 } 175 176 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than. 177 __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe 178 { 179 return cast(__m128) cmpps!(FPComparison.ogt)(a, b); 180 } 181 182 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than, 183 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 184 __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe 185 { 186 return cast(__m128) cmpss!(FPComparison.ogt)(a, b); 187 } 188 189 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal. 190 __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe 191 { 192 return cast(__m128) cmpps!(FPComparison.ole)(a, b); 193 } 194 195 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal, 196 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 197 __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe 198 { 199 return cast(__m128) cmpss!(FPComparison.ole)(a, b); 200 } 201 202 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than. 
203 __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe 204 { 205 return cast(__m128) cmpps!(FPComparison.olt)(a, b); 206 } 207 208 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than, 209 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 210 __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe 211 { 212 return cast(__m128) cmpss!(FPComparison.olt)(a, b); 213 } 214 215 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-equal. 216 __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe 217 { 218 return cast(__m128) cmpps!(FPComparison.une)(a, b); 219 } 220 221 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-equal, 222 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 223 __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe 224 { 225 return cast(__m128) cmpss!(FPComparison.une)(a, b); 226 } 227 228 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal. 229 __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe 230 { 231 return cast(__m128) cmpps!(FPComparison.ult)(a, b); 232 } 233 234 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal, 235 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 236 __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe 237 { 238 return cast(__m128) cmpss!(FPComparison.ult)(a, b); 239 } 240 241 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than. 242 __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe 243 { 244 return cast(__m128) cmpps!(FPComparison.ule)(a, b); 245 } 246 247 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than, 248 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 249 __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe 250 { 251 return cast(__m128) cmpss!(FPComparison.ule)(a, b); 252 } 253 254 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal. 255 __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe 256 { 257 return cast(__m128) cmpps!(FPComparison.ugt)(a, b); 258 } 259 260 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal, 261 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 262 __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe 263 { 264 return cast(__m128) cmpss!(FPComparison.ugt)(a, b); 265 } 266 267 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than. 268 __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe 269 { 270 return cast(__m128) cmpps!(FPComparison.uge)(a, b); 271 } 272 273 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than, 274 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 275 __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe 276 { 277 return cast(__m128) cmpss!(FPComparison.uge)(a, b); 278 } 279 280 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN. 
__m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ord)(a, b);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ord)(a, b);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN.
__m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.uno)(a, b);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.uno)(a, b);
}

// Note: we've reversed clang and GCC behaviour with regards to EFLAGS.
// Some such comparisons yield true for NaNs, others don't.

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for equality,
/// and return the boolean result (0 or 1).
int _mm_comieq_ss (__m128 a, __m128 b) pure @safe // comiss + sete
{
    return comss!(FPComparison.ueq)(a, b); // yields true for NaN!
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than-or-equal,
/// and return the boolean result (0 or 1).
int _mm_comige_ss (__m128 a, __m128 b) pure @safe // comiss + setae
{
    return comss!(FPComparison.oge)(a, b);
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than,
/// and return the boolean result (0 or 1).
int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
{
    return comss!(FPComparison.ogt)(a, b);
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than-or-equal,
/// and return the boolean result (0 or 1).
int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
{
    return comss!(FPComparison.ule)(a, b); // yields true for NaN!
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than,
/// and return the boolean result (0 or 1).
int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
{
    return comss!(FPComparison.ult)(a, b); // yields true for NaN!
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for not-equal,
/// and return the boolean result (0 or 1).
int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
{
    return comss!(FPComparison.one)(a, b);
}

/// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit)
/// floating-point elements, store the results in the lower 2 elements,
/// and copy the upper 2 packed elements from `a` to the upper elements of result.
alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;
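// Small usage check for the `comiss`-style comparisons above, including the NaN
// behaviour mentioned in the note: the `ueq`/`ule`/`ult` flavours return 1 when
// one operand is NaN, while the ordered flavours return 0.
unittest
{
    __m128 A = _mm_set_ss(2.0f);
    __m128 B = _mm_set_ss(3.0f);
    __m128 N = _mm_set_ss(float.nan);
    assert(_mm_comieq_ss(A, A) == 1);
    assert(_mm_comilt_ss(A, B) == 1);
    assert(_mm_comige_ss(A, B) == 0);
    assert(_mm_comieq_ss(A, N) == 1); // yields true for NaN
    assert(_mm_comige_ss(A, N) == 0);
}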
/// Convert 2 lower packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
__m64 _mm_cvt_ps2pi (__m128 a) @safe
{
    return to_m64(_mm_cvtps_epi32(a));
}

/// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element,
/// store the result in the lower element, and copy the upper 3 packed elements from `a` to the
/// upper elements of the result.
__m128 _mm_cvt_si2ss (__m128 v, int x) pure @trusted
{
    v.ptr[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42f, 0, 0, 0]);
}

/// Convert packed 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpi16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpi16_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit)
/// floating-point elements, store the results in the lower 2 elements,
/// and copy the upper 2 packed elements from `a` to the upper elements of result.
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted
{
    __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
    a.ptr[0] = fb.array[0];
    a.ptr[1] = fb.array[1];
    return a;
}
unittest
{
    __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
    float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements,
/// store the results in the lower 2 elements, then convert the packed signed 32-bit integers in `b` to
/// single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements.
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted
{
    long2 l;
    l.ptr[0] = a.array[0];
    l.ptr[1] = b.array[0];
    return _mm_cvtepi32_ps(cast(__m128i)l);
}
unittest
{
    __m64 A = _mm_setr_pi32(-45, 128);
    __m64 B = _mm_setr_pi32(0, 1000);
    __m128 R = _mm_cvtpi32x2_ps(A, B);
    float[4] correct = [-45.0f, 128.0f, 0.0f, 1000.0f];
    assert(R.array == correct);
}

/// Convert the lower packed 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpi8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());

    // Replicate sign bit
    b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpi8_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}
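// Usage example for `_mm_cvt_ps2pi` above: only the 2 lower lanes are converted,
// using the current MXCSR rounding mode (values here are exact, so rounding does not matter).
unittest
{
    __m64 R = _mm_cvt_ps2pi(_mm_setr_ps(3.0f, -4.0f, 7.5f, 8.5f));
    int2 R2 = cast(int2) R;
    int[2] correct = [3, -4];
    assert(R2.array == correct);
}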
/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 16-bit integers.
/// Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF.
__m64 _mm_cvtps_pi16 (__m128 a) @safe
{
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw,
    // which means the 16-bit integers are saturated.
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, b);
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
    short4 R = cast(short4) _mm_cvtps_pi16(A);
    short[4] correct = [-1, 2, -32768, 32767];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers.
__m64 _mm_cvtps_pi32 (__m128 a) @safe
{
    return to_m64(_mm_cvtps_epi32(a));
}
unittest
{
    __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f);
    int2 R = cast(int2) _mm_cvtps_pi32(A);
    int[2] correct = [-33000, 70000];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 8-bit integers,
/// and store the results in the lower 4 elements.
/// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF.
__m64 _mm_cvtps_pi8 (__m128 a) @safe
{
    // The C++ version of this intrinsic converts to 32-bit integers, then uses packssdw + packsswb,
    // which means the 8-bit integers are saturated.
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, _mm_setzero_si128());
    b = _mm_packs_epi16(b, _mm_setzero_si128());
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
    byte8 R = cast(byte8) _mm_cvtps_pi8(A);
    byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Convert packed unsigned 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpu16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpu16_ps(A);
    float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert the lower packed unsigned 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpu8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpu8_ps(A);
    float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element,
/// store the result in the lower element, and copy the upper 3 packed elements from `a` to the
/// upper elements of result.
537 __m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted 538 { 539 v.ptr[0] = cast(float)x; 540 return v; 541 } 542 unittest 543 { 544 __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42); 545 assert(a.array == [42.0f, 0, 0, 0]); 546 } 547 548 549 /// Convert the signed 64-bit integer `b` to a single-precision (32-bit) floating-point element, 550 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 551 /// upper elements of result. 552 __m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted 553 { 554 v.ptr[0] = cast(float)x; 555 return v; 556 } 557 unittest 558 { 559 __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42); 560 assert(a.array == [42.0f, 0, 0, 0]); 561 } 562 563 /// Take the lower single-precision (32-bit) floating-point element of `a`. 564 float _mm_cvtss_f32(__m128 a) pure @safe 565 { 566 return a.array[0]; 567 } 568 569 static if (LDC_with_SSE1) 570 { 571 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer. 572 int _mm_cvtss_si32 (__m128 a) @safe // PERF GDC 573 { 574 return __builtin_ia32_cvtss2si(a); 575 } 576 } 577 else 578 { 579 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer. 580 int _mm_cvtss_si32 (__m128 a) @safe 581 { 582 return convertFloatToInt32UsingMXCSR(a.array[0]); 583 } 584 } 585 unittest 586 { 587 assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f))); 588 } 589 590 version(LDC) 591 { 592 version(X86_64) 593 { 594 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer. 595 long _mm_cvtss_si64 (__m128 a) @safe 596 { 597 return __builtin_ia32_cvtss2si64(a); 598 } 599 } 600 else 601 { 602 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer. 603 long _mm_cvtss_si64 (__m128 a) @safe 604 { 605 // Note: __builtin_ia32_cvtss2si64 crashes LDC in 32-bit 606 return convertFloatToInt64UsingMXCSR(a.array[0]); 607 } 608 } 609 } 610 else 611 { 612 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer. 613 long _mm_cvtss_si64 (__m128 a) @safe 614 { 615 return convertFloatToInt64UsingMXCSR(a.array[0]); 616 } 617 } 618 unittest 619 { 620 assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f))); 621 622 uint savedRounding = _MM_GET_ROUNDING_MODE(); 623 624 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 625 assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f))); 626 627 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 628 assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f))); 629 630 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 631 assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f))); 632 633 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 634 assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f))); 635 636 _MM_SET_ROUNDING_MODE(savedRounding); 637 } 638 639 640 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit 641 /// integer with truncation. 642 int _mm_cvtt_ss2si (__m128 a) pure @safe 643 { 644 // x86: cvttss2si always generated, even in -O0 645 return cast(int)(a.array[0]); 646 } 647 alias _mm_cvttss_si32 = _mm_cvtt_ss2si; ///ditto 648 unittest 649 { 650 assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); 651 } 652 653 654 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit 655 /// integers with truncation. 
656 __m64 _mm_cvtt_ps2pi (__m128 a) pure @safe 657 { 658 return to_m64(_mm_cvttps_epi32(a)); 659 } 660 661 662 // Note: __builtin_ia32_cvttss2si64 crashes LDC when generating 32-bit x86 code. 663 long _mm_cvttss_si64 (__m128 a) pure @safe 664 { 665 return cast(long)(a.array[0]); // Generates cvttss2si as expected 666 } 667 unittest 668 { 669 assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); 670 } 671 672 __m128 _mm_div_ps(__m128 a, __m128 b) pure @safe 673 { 674 return a / b; 675 } 676 unittest 677 { 678 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 679 a = _mm_div_ps(a, a); 680 float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f]; 681 assert(a.array == correct); 682 } 683 684 __m128 _mm_div_ss(__m128 a, __m128 b) pure @safe 685 { 686 static if (GDC_with_SSE) 687 return __builtin_ia32_divss(a, b); 688 else 689 { 690 a[0] /= b[0]; 691 return a; 692 } 693 } 694 unittest 695 { 696 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 697 a = _mm_div_ss(a, a); 698 float[4] correct = [1.0f, -2.0, 3.0f, 1.0f]; 699 assert(a.array == correct); 700 } 701 702 int _mm_extract_pi16 (__m64 a, int imm8) 703 { 704 short4 sa = cast(short4)a; 705 return cast(ushort)(sa.array[imm8]); 706 } 707 unittest 708 { 709 __m64 A = _mm_setr_pi16(-1, 6, 0, 4); 710 assert(_mm_extract_pi16(A, 0) == 65535); 711 assert(_mm_extract_pi16(A, 1) == 6); 712 assert(_mm_extract_pi16(A, 2) == 0); 713 assert(_mm_extract_pi16(A, 3) == 4); 714 } 715 716 /// Free aligned memory that was allocated with `_mm_malloc`. 717 void _mm_free(void * mem_addr) @trusted 718 { 719 // support for free(NULL) 720 if (mem_addr is null) 721 return; 722 723 // Technically we don't need to store size and alignement in the chunk, but we do in case we 724 // have to implement _mm_realloc 725 726 size_t pointerSize = (void*).sizeof; 727 void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof); 728 size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize); 729 size_t alignment = *alignmentLocation; 730 assert(alignment != 0); 731 assert(isPointerAligned(mem_addr, alignment)); 732 free(*rawLocation); 733 } 734 735 uint _MM_GET_EXCEPTION_MASK() @safe 736 { 737 return _mm_getcsr() & _MM_MASK_MASK; 738 } 739 740 uint _MM_GET_EXCEPTION_STATE() @safe 741 { 742 return _mm_getcsr() & _MM_EXCEPT_MASK; 743 } 744 745 uint _MM_GET_FLUSH_ZERO_MODE() @safe 746 { 747 return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; 748 } 749 750 uint _MM_GET_ROUNDING_MODE() @safe 751 { 752 return _mm_getcsr() & _MM_ROUND_MASK; 753 } 754 755 uint _mm_getcsr() @trusted 756 { 757 static if (LDC_with_ARM) 758 { 759 // Note: we convert the ARM FPSCR into a x86 SSE control word. 760 // However, only rounding mode and flush to zero are actually set. 761 // The returned control word will have all exceptions masked, and no exception detected. 762 763 uint fpscr = arm_get_fpcr(); 764 765 uint cw = 0; // No exception detected 766 if (fpscr & _MM_FLUSH_ZERO_MASK_ARM) 767 { 768 // ARM has one single flag for ARM. 769 // It does both x86 bits. 
770 // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode 771 cw |= _MM_FLUSH_ZERO_ON; 772 cw |= 0x40; // set "denormals are zeros" 773 } 774 cw |= _MM_MASK_MASK; // All exception maske 775 776 // Rounding mode 777 switch(fpscr & _MM_ROUND_MASK_ARM) 778 { 779 default: 780 case _MM_ROUND_NEAREST_ARM: cw |= _MM_ROUND_NEAREST; break; 781 case _MM_ROUND_DOWN_ARM: cw |= _MM_ROUND_DOWN; break; 782 case _MM_ROUND_UP_ARM: cw |= _MM_ROUND_UP; break; 783 case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO; break; 784 } 785 return cw; 786 } 787 else version(GNU) 788 { 789 static if (GDC_with_SSE) 790 { 791 return __builtin_ia32_stmxcsr(); 792 } 793 else version(X86) 794 { 795 uint sseRounding = 0; 796 asm pure nothrow @nogc @trusted 797 { 798 "stmxcsr %0;\n" 799 : "=m" (sseRounding) 800 : 801 : ; 802 } 803 return sseRounding; 804 } 805 else 806 static assert(false); 807 } 808 else version (InlineX86Asm) 809 { 810 uint controlWord; 811 asm nothrow @nogc pure @safe 812 { 813 stmxcsr controlWord; 814 } 815 return controlWord; 816 } 817 else 818 static assert(0, "Not yet supported"); 819 } 820 unittest 821 { 822 uint csr = _mm_getcsr(); 823 } 824 825 __m64 _mm_insert_pi16 (__m64 v, int i, int index) pure @trusted 826 { 827 short4 r = cast(short4)v; 828 r.ptr[index & 3] = cast(short)i; 829 return cast(__m64)r; 830 } 831 unittest 832 { 833 __m64 A = _mm_set_pi16(3, 2, 1, 0); 834 short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4); 835 short[4] correct = [0, 42, 2, 3]; 836 assert(R.array == correct); 837 } 838 839 __m128 _mm_load_ps(const(float)*p) pure @trusted 840 { 841 return *cast(__m128*)p; 842 } 843 844 __m128 _mm_load_ps1(const(float)*p) pure @trusted 845 { 846 return __m128(*p); 847 } 848 849 __m128 _mm_load_ss (const(float)* mem_addr) pure @trusted 850 { 851 __m128 r; 852 r.ptr[0] = *mem_addr; 853 r.ptr[1] = 0; 854 r.ptr[2] = 0; 855 r.ptr[3] = 0; 856 return r; 857 } 858 859 alias _mm_load1_ps = _mm_load_ps1; 860 861 __m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted 862 { 863 long2 la = cast(long2)a; 864 la.ptr[1] = (*mem_addr).array[0]; 865 return cast(__m128)la; 866 } 867 868 __m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted 869 { 870 long2 la = cast(long2)a; 871 la.ptr[0] = (*mem_addr).array[0]; 872 return cast(__m128)la; 873 } 874 875 __m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted 876 { 877 __m128* aligned = cast(__m128*)mem_addr; 878 __m128 a = *aligned; 879 __m128 r; 880 r.ptr[0] = a.array[3]; 881 r.ptr[1] = a.array[2]; 882 r.ptr[2] = a.array[1]; 883 r.ptr[3] = a.array[0]; 884 return r; 885 } 886 887 __m128 _mm_loadu_ps(const(float)*p) pure @safe 888 { 889 return loadUnaligned!(__m128)(p); 890 } 891 892 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted 893 { 894 short r = *cast(short*)(mem_addr); 895 short8 result = [0, 0, 0, 0, 0, 0, 0, 0]; 896 result.ptr[0] = r; 897 return cast(__m128i)result; 898 } 899 unittest 900 { 901 short r = 13; 902 short8 A = cast(short8) _mm_loadu_si16(&r); 903 short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0]; 904 assert(A.array == correct); 905 } 906 907 __m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted 908 { 909 long r = *cast(long*)(mem_addr); 910 long2 result = [0, 0]; 911 result.ptr[0] = r; 912 return cast(__m128i)result; 913 } 914 unittest 915 { 916 long r = 446446446446; 917 long2 A = cast(long2) _mm_loadu_si64(&r); 918 long[2] correct = [446446446446, 0]; 919 assert(A.array == correct); 920 } 921 922 /// Allocate size 
bytes of memory, aligned to the alignment specified in align, 923 /// and return a pointer to the allocated memory. `_mm_free` should be used to free 924 /// memory that is allocated with `_mm_malloc`. 925 void* _mm_malloc(size_t size, size_t alignment) @trusted 926 { 927 assert(alignment != 0); 928 size_t request = requestedSize(size, alignment); 929 void* raw = malloc(request); 930 if (request > 0 && raw == null) // malloc(0) can validly return anything 931 onOutOfMemoryError(); 932 return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size 933 } 934 935 void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted 936 { 937 // this works since mask is zero-extended 938 return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr); 939 } 940 941 deprecated alias _m_maskmovq = _mm_maskmove_si64; 942 943 __m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe 944 { 945 return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b))); 946 } 947 948 static if (GDC_with_SSE) 949 { 950 alias _mm_max_ps = __builtin_ia32_maxps; 951 } 952 else static if (LDC_with_SSE1) 953 { 954 alias _mm_max_ps = __builtin_ia32_maxps; 955 } 956 else 957 { 958 __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe 959 { 960 // ARM: Optimized into fcmgt + bsl since LDC 1.8 -02 961 __m128 r; 962 r[0] = (a[0] > b[0]) ? a[0] : b[0]; 963 r[1] = (a[1] > b[1]) ? a[1] : b[1]; 964 r[2] = (a[2] > b[2]) ? a[2] : b[2]; 965 r[3] = (a[3] > b[3]) ? a[3] : b[3]; 966 return r; 967 } 968 } 969 unittest 970 { 971 __m128 A = _mm_setr_ps(1, 2, float.nan, 4); 972 __m128 B = _mm_setr_ps(4, 1, 4, float.nan); 973 __m128 M = _mm_max_ps(A, B); 974 assert(M.array[0] == 4); 975 assert(M.array[1] == 2); 976 assert(M.array[2] == 4); // in case of NaN, second operand prevails (as it seems) 977 assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems) 978 } 979 980 __m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe 981 { 982 return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b))); 983 } 984 985 static if (GDC_with_SSE) 986 { 987 alias _mm_max_ss = __builtin_ia32_maxss; 988 } 989 else static if (LDC_with_SSE1) 990 { 991 alias _mm_max_ss = __builtin_ia32_maxss; 992 } 993 else 994 { 995 __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe 996 { 997 __m128 r = a; 998 r[0] = (a[0] > b[0]) ? a[0] : b[0]; 999 return r; 1000 } 1001 } 1002 unittest 1003 { 1004 __m128 A = _mm_setr_ps(1, 2, 3, 4); 1005 __m128 B = _mm_setr_ps(4, 1, 4, 1); 1006 __m128 C = _mm_setr_ps(float.nan, 1, 4, 1); 1007 __m128 M = _mm_max_ss(A, B); 1008 assert(M.array[0] == 4); 1009 assert(M.array[1] == 2); 1010 assert(M.array[2] == 3); 1011 assert(M.array[3] == 4); 1012 M = _mm_max_ps(A, C); // in case of NaN, second operand prevails 1013 assert(M.array[0] != M.array[0]); 1014 M = _mm_max_ps(C, A); // in case of NaN, second operand prevails 1015 assert(M.array[0] == 1); 1016 } 1017 1018 __m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe 1019 { 1020 return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b))); 1021 } 1022 1023 static if (GDC_with_SSE) 1024 { 1025 alias _mm_min_ps = __builtin_ia32_minps; 1026 } 1027 else static if (LDC_with_SSE1) 1028 { 1029 alias _mm_min_ps = __builtin_ia32_minps; 1030 } 1031 else 1032 { 1033 __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe 1034 { 1035 // ARM: Optimized into fcmgt + bsl since LDC 1.8 -02 1036 __m128 r; 1037 r[0] = (a[0] < b[0]) ? a[0] : b[0]; 1038 r[1] = (a[1] < b[1]) ? a[1] : b[1]; 1039 r[2] = (a[2] < b[2]) ? a[2] : b[2]; 1040 r[3] = (a[3] < b[3]) ? 
a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_min_ps(A, B);
    assert(M.array[0] == 1);
    assert(M.array[1] == 1);
    assert(M.array[2] == 4); // in case of NaN, second operand prevails (as it seems)
    assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
}

__m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b)));
}

static if (GDC_with_SSE)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}
else static if (LDC_with_SSE1)
{
    alias _mm_min_ss = __builtin_ia32_minss;
}
else
{
    __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
    {
        // Generates minss since LDC 1.3 -O1
        __m128 r = a;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_min_ss(A, B);
    assert(M.array[0] == 1);
    assert(M.array[1] == 2);
    assert(M.array[2] == 3);
    assert(M.array[3] == 4);
    M = _mm_min_ps(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_min_ps(C, A); // in case of NaN, second operand prevails
    assert(M.array[0] == 1);
}

__m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}

__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted
{
    // MOVHLPS semantics: the lower 2 elements of the result come from the upper 2
    // elements of `b`, the upper 2 elements are copied from `a`.
    a.ptr[0] = b.array[2];
    a.ptr[1] = b.array[3];
    return a;
}

__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted
{
    a.ptr[2] = b.array[0];
    a.ptr[3] = b.array[1];
    return a;
}

int _mm_movemask_pi8 (__m64 a) pure @safe
{
    return _mm_movemask_epi8(to_m128i(a));
}
unittest
{
    assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0)));
}

static if (GDC_with_SSE)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}
else static if (LDC_with_SSE1)
{
    alias _mm_movemask_ps = __builtin_ia32_movmskps;
}
else static if (LDC_with_ARM)
{
    int _mm_movemask_ps (__m128 a) pure @safe
    {
        int4 ai = cast(int4)a;
        int4 shift31 = [31, 31, 31, 31];
        ai = ai >>> shift31;
        int4 shift = [0, 1, 2, 3];
        ai = ai << shift; // 4-way shift, only efficient on ARM.
1140 int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]); 1141 return r; 1142 } 1143 } 1144 else 1145 { 1146 int _mm_movemask_ps (__m128 a) pure @safe 1147 { 1148 int4 ai = cast(int4)a; 1149 int r = 0; 1150 if (ai[0] < 0) r += 1; 1151 if (ai[1] < 0) r += 2; 1152 if (ai[2] < 0) r += 4; 1153 if (ai[3] < 0) r += 8; 1154 return r; 1155 } 1156 } 1157 unittest 1158 { 1159 int4 A = [-1, 0, -43, 0]; 1160 assert(5 == _mm_movemask_ps(cast(float4)A)); 1161 } 1162 1163 __m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe 1164 { 1165 return a * b; 1166 } 1167 unittest 1168 { 1169 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1170 a = _mm_mul_ps(a, a); 1171 float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f]; 1172 assert(a.array == correct); 1173 } 1174 1175 __m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe 1176 { 1177 static if (GDC_with_SSE) 1178 return __builtin_ia32_mulss(a, b); 1179 else 1180 { 1181 a[0] *= b[0]; 1182 return a; 1183 } 1184 } 1185 unittest 1186 { 1187 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1188 a = _mm_mul_ss(a, a); 1189 float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f]; 1190 assert(a.array == correct); 1191 } 1192 1193 __m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe 1194 { 1195 return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b))); 1196 } 1197 unittest 1198 { 1199 __m64 A = _mm_setr_pi16(0, -16, 2, 3); 1200 __m64 B = _mm_set1_pi16(16384); 1201 short4 R = cast(short4)_mm_mulhi_pu16(A, B); 1202 short[4] correct = [0, 0x3FFC, 0, 0]; 1203 assert(R.array == correct); 1204 } 1205 1206 __m128 _mm_or_ps (__m128 a, __m128 b) pure @safe 1207 { 1208 return cast(__m128)(cast(__m128i)a | cast(__m128i)b); 1209 } 1210 1211 deprecated alias 1212 _m_pavgb = _mm_avg_pu8, 1213 _m_pavgw = _mm_avg_pu16, 1214 _m_pextrw = _mm_extract_pi16, 1215 _m_pinsrw = _mm_insert_pi16, 1216 _m_pmaxsw = _mm_max_pi16, 1217 _m_pmaxub = _mm_max_pu8, 1218 _m_pminsw = _mm_min_pi16, 1219 _m_pminub = _mm_min_pu8, 1220 _m_pmovmskb = _mm_movemask_pi8, 1221 _m_pmulhuw = _mm_mulhi_pu16; 1222 1223 enum _MM_HINT_T0 = 3; /// 1224 enum _MM_HINT_T1 = 2; /// 1225 enum _MM_HINT_T2 = 1; /// 1226 enum _MM_HINT_NTA = 0; /// 1227 1228 1229 version(LDC) 1230 { 1231 // Starting with LLVM 10, it seems llvm.prefetch has changed its name. 1232 // Was reported at: https://github.com/ldc-developers/ldc/issues/3397 1233 static if (__VERSION__ >= 2091) 1234 { 1235 pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch" 1236 void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe; 1237 } 1238 } 1239 1240 /// Fetch the line of data from memory that contains address `p` to a location in the 1241 /// cache hierarchy specified by the locality hint i. 1242 /// 1243 /// Warning: `locality` is a compile-time parameter, unlike in Intel Intrinsics API. 1244 void _mm_prefetch(int locality)(const(void)* p) pure @trusted 1245 { 1246 static if (GDC_with_SSE) 1247 { 1248 return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3); 1249 } 1250 else version(LDC) 1251 { 1252 static if (__VERSION__ >= 2091) 1253 { 1254 // const_cast here. `llvm_prefetch` wants a mutable pointer 1255 llvm_prefetch_fixed( cast(void*)p, 0, locality, 1); 1256 } 1257 else 1258 { 1259 // const_cast here. 
`llvm_prefetch` wants a mutable pointer 1260 llvm_prefetch( cast(void*)p, 0, locality, 1); 1261 } 1262 } 1263 else version(D_InlineAsm_X86_64) 1264 { 1265 static if (locality == _MM_HINT_NTA) 1266 { 1267 asm pure nothrow @nogc @trusted 1268 { 1269 mov RAX, p; 1270 prefetchnta [RAX]; 1271 } 1272 } 1273 else static if (locality == _MM_HINT_T0) 1274 { 1275 asm pure nothrow @nogc @trusted 1276 { 1277 mov RAX, p; 1278 prefetcht0 [RAX]; 1279 } 1280 } 1281 else static if (locality == _MM_HINT_T1) 1282 { 1283 asm pure nothrow @nogc @trusted 1284 { 1285 mov RAX, p; 1286 prefetcht1 [RAX]; 1287 } 1288 } 1289 else static if (locality == _MM_HINT_T2) 1290 { 1291 asm pure nothrow @nogc @trusted 1292 { 1293 mov RAX, p; 1294 prefetcht2 [RAX]; 1295 } 1296 } 1297 else 1298 assert(false); // invalid locality hint 1299 } 1300 else version(D_InlineAsm_X86) 1301 { 1302 static if (locality == _MM_HINT_NTA) 1303 { 1304 asm pure nothrow @nogc @trusted 1305 { 1306 mov EAX, p; 1307 prefetchnta [EAX]; 1308 } 1309 } 1310 else static if (locality == _MM_HINT_T0) 1311 { 1312 asm pure nothrow @nogc @trusted 1313 { 1314 mov EAX, p; 1315 prefetcht0 [EAX]; 1316 } 1317 } 1318 else static if (locality == _MM_HINT_T1) 1319 { 1320 asm pure nothrow @nogc @trusted 1321 { 1322 mov EAX, p; 1323 prefetcht1 [EAX]; 1324 } 1325 } 1326 else static if (locality == _MM_HINT_T2) 1327 { 1328 asm pure nothrow @nogc @trusted 1329 { 1330 mov EAX, p; 1331 prefetcht2 [EAX]; 1332 } 1333 } 1334 else 1335 assert(false); // invalid locality hint 1336 } 1337 else 1338 { 1339 // Generic version: do nothing. From bitter experience, 1340 // it's unlikely you get ANY speed-up with manual prefetching. 1341 // Prefetching or not doesn't change program behaviour. 1342 } 1343 } 1344 unittest 1345 { 1346 // From Intel documentation: 1347 // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of 32 bytes." 
1348 ubyte[256] cacheline; // though it seems it cannot generate GP fault 1349 _mm_prefetch!_MM_HINT_T0(cacheline.ptr); 1350 _mm_prefetch!_MM_HINT_T1(cacheline.ptr); 1351 _mm_prefetch!_MM_HINT_T2(cacheline.ptr); 1352 _mm_prefetch!_MM_HINT_NTA(cacheline.ptr); 1353 } 1354 1355 deprecated alias 1356 _m_psadbw = _mm_sad_pu8, 1357 _m_pshufw = _mm_shuffle_pi16; 1358 1359 static if (GDC_with_SSE) 1360 { 1361 alias _mm_rcp_ps = __builtin_ia32_rcpps; 1362 } 1363 else static if (LDC_with_SSE1) 1364 { 1365 alias _mm_rcp_ps = __builtin_ia32_rcpps; 1366 } 1367 else 1368 { 1369 __m128 _mm_rcp_ps (__m128 a) pure @safe 1370 { 1371 a[0] = 1.0f / a[0]; 1372 a[1] = 1.0f / a[1]; 1373 a[2] = 1.0f / a[2]; 1374 a[3] = 1.0f / a[3]; 1375 return a; 1376 } 1377 } 1378 1379 static if (GDC_with_SSE) 1380 { 1381 alias _mm_rcp_ss = __builtin_ia32_rcpss; 1382 } 1383 else static if (LDC_with_SSE1) 1384 { 1385 alias _mm_rcp_ss = __builtin_ia32_rcpss; 1386 } 1387 else 1388 { 1389 __m128 _mm_rcp_ss (__m128 a) pure @safe 1390 { 1391 a[0] = 1.0f / a[0]; 1392 return a; 1393 } 1394 } 1395 1396 static if (GDC_with_SSE) 1397 { 1398 alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps; 1399 } 1400 else static if (LDC_with_SSE1) 1401 { 1402 alias _mm_rsqrt_ps = __builtin_ia32_rsqrtps; 1403 } 1404 else version(LDC) 1405 { 1406 __m128 _mm_rsqrt_ps (__m128 a) pure @safe 1407 { 1408 a[0] = 1.0f / llvm_sqrt(a[0]); 1409 a[1] = 1.0f / llvm_sqrt(a[1]); 1410 a[2] = 1.0f / llvm_sqrt(a[2]); 1411 a[3] = 1.0f / llvm_sqrt(a[3]); 1412 return a; 1413 } 1414 } 1415 else 1416 { 1417 __m128 _mm_rsqrt_ps (__m128 a) pure @safe 1418 { 1419 a[0] = 1.0f / sqrt(a[0]); 1420 a[1] = 1.0f / sqrt(a[1]); 1421 a[2] = 1.0f / sqrt(a[2]); 1422 a[3] = 1.0f / sqrt(a[3]); 1423 return a; 1424 } 1425 } 1426 1427 static if (GDC_with_SSE) 1428 { 1429 alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss; 1430 } 1431 else static if (LDC_with_SSE1) 1432 { 1433 alias _mm_rsqrt_ss = __builtin_ia32_rsqrtss; 1434 } 1435 else version(LDC) 1436 { 1437 __m128 _mm_rsqrt_ss (__m128 a) pure @safe 1438 { 1439 a[0] = 1.0f / llvm_sqrt(a[0]); 1440 return a; 1441 } 1442 } 1443 else 1444 { 1445 __m128 _mm_rsqrt_ss (__m128 a) pure @safe 1446 { 1447 a[0] = 1.0f / sqrt(a[0]); 1448 return a; 1449 } 1450 } 1451 1452 unittest 1453 { 1454 double maxRelativeError = 0.000245; // -72 dB 1455 void testInvSqrt(float number) nothrow @nogc 1456 { 1457 __m128 A = _mm_set1_ps(number); 1458 1459 // test _mm_rcp_ps 1460 __m128 B = _mm_rcp_ps(A); 1461 foreach(i; 0..4) 1462 { 1463 double exact = 1.0f / A.array[i]; 1464 double ratio = cast(double)(B.array[i]) / cast(double)(exact); 1465 assert(abs(ratio - 1) <= maxRelativeError); 1466 } 1467 1468 // test _mm_rcp_ss 1469 { 1470 B = _mm_rcp_ss(A); 1471 double exact = 1.0f / A.array[0]; 1472 double ratio = cast(double)(B.array[0]) / cast(double)(exact); 1473 assert(abs(ratio - 1) <= maxRelativeError); 1474 } 1475 1476 // test _mm_rsqrt_ps 1477 B = _mm_rsqrt_ps(A); 1478 foreach(i; 0..4) 1479 { 1480 double exact = 1.0f / sqrt(A.array[i]); 1481 double ratio = cast(double)(B.array[i]) / cast(double)(exact); 1482 assert(abs(ratio - 1) <= maxRelativeError); 1483 } 1484 1485 // test _mm_rsqrt_ss 1486 { 1487 B = _mm_rsqrt_ss(A); 1488 double exact = 1.0f / sqrt(A.array[0]); 1489 double ratio = cast(double)(B.array[0]) / cast(double)(exact); 1490 assert(abs(ratio - 1) <= maxRelativeError); 1491 } 1492 } 1493 1494 testInvSqrt(1.1f); 1495 testInvSqrt(2.45674864151f); 1496 testInvSqrt(27841456468.0f); 1497 } 1498 1499 __m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe 1500 { 1501 return 
to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b))); 1502 } 1503 1504 void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe 1505 { 1506 // Note: unsupported on ARM 1507 _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx); 1508 } 1509 1510 void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe 1511 { 1512 // Note: unsupported on ARM 1513 _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx); 1514 } 1515 1516 void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe 1517 { 1518 _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx); 1519 } 1520 1521 __m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted 1522 { 1523 // Note: despite appearances, generates sensible code, 1524 // inlines correctly and is constant folded 1525 float[4] result = [e0, e1, e2, e3]; 1526 return loadUnaligned!(float4)(result.ptr); 1527 } 1528 unittest 1529 { 1530 __m128 A = _mm_set_ps(3, 2, 1, 546); 1531 float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f]; 1532 assert(A.array == correct); 1533 assert(A.array[0] == 546.0f); 1534 assert(A.array[1] == 1.0f); 1535 assert(A.array[2] == 2.0f); 1536 assert(A.array[3] == 3.0f); 1537 } 1538 1539 alias _mm_set_ps1 = _mm_set1_ps; 1540 1541 void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe 1542 { 1543 _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx); 1544 } 1545 1546 __m128 _mm_set_ss (float a) pure @trusted 1547 { 1548 __m128 r = _mm_setzero_ps(); 1549 r.ptr[0] = a; 1550 return r; 1551 } 1552 unittest 1553 { 1554 float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f]; 1555 __m128 A = _mm_set_ss(42.0f); 1556 assert(A.array == correct); 1557 } 1558 1559 __m128 _mm_set1_ps (float a) pure @trusted 1560 { 1561 __m128 r = void; 1562 r.ptr[0] = a; 1563 r.ptr[1] = a; 1564 r.ptr[2] = a; 1565 r.ptr[3] = a; 1566 return r; 1567 } 1568 unittest 1569 { 1570 float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f]; 1571 __m128 A = _mm_set1_ps(42.0f); 1572 assert(A.array == correct); 1573 } 1574 1575 void _mm_setcsr(uint controlWord) @trusted 1576 { 1577 static if (LDC_with_ARM) 1578 { 1579 // Convert from SSE to ARM control word. This is done _partially_ 1580 // and only support rounding mode changes. 
1581 1582 // "To alter some bits of a VFP system register without 1583 // affecting other bits, use a read-modify-write procedure" 1584 uint fpscr = arm_get_fpcr(); 1585 1586 // Bits 23 to 22 are rounding modes, however not used in NEON 1587 fpscr = fpscr & ~_MM_ROUND_MASK_ARM; 1588 switch(controlWord & _MM_ROUND_MASK) 1589 { 1590 default: 1591 case _MM_ROUND_NEAREST: fpscr |= _MM_ROUND_NEAREST_ARM; break; 1592 case _MM_ROUND_DOWN: fpscr |= _MM_ROUND_DOWN_ARM; break; 1593 case _MM_ROUND_UP: fpscr |= _MM_ROUND_UP_ARM; break; 1594 case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break; 1595 } 1596 fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM; 1597 if (controlWord & _MM_FLUSH_ZERO_MASK) 1598 fpscr |= _MM_FLUSH_ZERO_MASK_ARM; 1599 arm_set_fpcr(fpscr); 1600 } 1601 else version(GNU) 1602 { 1603 static if (GDC_with_SSE) 1604 { 1605 __builtin_ia32_ldmxcsr(controlWord); 1606 } 1607 else version(X86) 1608 { 1609 asm pure nothrow @nogc @trusted 1610 { 1611 "ldmxcsr %0;\n" 1612 : 1613 : "m" (controlWord) 1614 : ; 1615 } 1616 } 1617 else 1618 static assert(false); 1619 } 1620 else version (InlineX86Asm) 1621 { 1622 asm pure nothrow @nogc @safe 1623 { 1624 ldmxcsr controlWord; 1625 } 1626 } 1627 else 1628 static assert(0, "Not yet supported"); 1629 } 1630 unittest 1631 { 1632 _mm_setcsr(_mm_getcsr()); 1633 } 1634 1635 __m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted 1636 { 1637 float[4] result = [e3, e2, e1, e0]; 1638 return loadUnaligned!(float4)(result.ptr); 1639 } 1640 unittest 1641 { 1642 __m128 A = _mm_setr_ps(3, 2, 1, 546); 1643 float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f]; 1644 assert(A.array == correct); 1645 assert(A.array[0] == 3.0f); 1646 assert(A.array[1] == 2.0f); 1647 assert(A.array[2] == 1.0f); 1648 assert(A.array[3] == 546.0f); 1649 } 1650 1651 __m128 _mm_setzero_ps() pure @trusted 1652 { 1653 // Compiles to xorps without problems 1654 float[4] result = [0.0f, 0.0f, 0.0f, 0.0f]; 1655 return loadUnaligned!(float4)(result.ptr); 1656 } 1657 1658 version(GNU) 1659 { 1660 void _mm_sfence() pure @trusted 1661 { 1662 static if (GDC_with_SSE) 1663 { 1664 __builtin_ia32_sfence(); 1665 } 1666 else version(X86) 1667 { 1668 asm pure nothrow @nogc @trusted 1669 { 1670 "sfence;\n" : : : ; 1671 } 1672 } 1673 else 1674 static assert(false); 1675 } 1676 } 1677 else static if (LDC_with_SSE1) 1678 { 1679 alias _mm_sfence = __builtin_ia32_sfence; 1680 } 1681 else static if (DMD_with_asm) 1682 { 1683 void _mm_sfence() pure @safe 1684 { 1685 asm nothrow @nogc pure @safe 1686 { 1687 sfence; 1688 } 1689 } 1690 } 1691 else version(LDC) 1692 { 1693 void _mm_sfence() pure @safe 1694 { 1695 llvm_memory_fence(); // PERF: generates mfence 1696 } 1697 } 1698 else 1699 static assert(false); 1700 unittest 1701 { 1702 _mm_sfence(); 1703 } 1704 1705 /// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime. 1706 __m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @safe 1707 { 1708 return cast(__m64) shufflevector!(short4, ( (imm8 >> 0) & 3 ), 1709 ( (imm8 >> 2) & 3 ), 1710 ( (imm8 >> 4) & 3 ), 1711 ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a); 1712 } 1713 unittest 1714 { 1715 __m64 A = _mm_setr_pi16(0, 1, 2, 3); 1716 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 1717 short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A); 1718 short[4] expectedB = [ 3, 2, 1, 0 ]; 1719 assert(B.array == expectedB); 1720 } 1721 1722 /// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime. 
1723 __m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe 1724 { 1725 return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b); 1726 } 1727 1728 static if (GDC_with_SSE) 1729 { 1730 alias _mm_sqrt_ps = __builtin_ia32_sqrtps; 1731 } 1732 else version(LDC) 1733 { 1734 // Disappeared with LDC 1.11 1735 static if (__VERSION__ < 2081) 1736 alias _mm_sqrt_ps = __builtin_ia32_sqrtps; 1737 else 1738 { 1739 __m128 _mm_sqrt_ps(__m128 vec) pure @safe 1740 { 1741 vec.array[0] = llvm_sqrt(vec.array[0]); 1742 vec.array[1] = llvm_sqrt(vec.array[1]); 1743 vec.array[2] = llvm_sqrt(vec.array[2]); 1744 vec.array[3] = llvm_sqrt(vec.array[3]); 1745 return vec; 1746 } 1747 } 1748 } 1749 else 1750 { 1751 __m128 _mm_sqrt_ps(__m128 vec) pure @trusted 1752 { 1753 vec.ptr[0] = sqrt(vec.array[0]); 1754 vec.ptr[1] = sqrt(vec.array[1]); 1755 vec.ptr[2] = sqrt(vec.array[2]); 1756 vec.ptr[3] = sqrt(vec.array[3]); 1757 return vec; 1758 } 1759 } 1760 unittest 1761 { 1762 __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f)); 1763 assert(A.array[0] == 2.0f); 1764 assert(A.array[1] == 2.0f); 1765 assert(A.array[2] == 2.0f); 1766 assert(A.array[3] == 2.0f); 1767 } 1768 1769 static if (GDC_with_SSE) 1770 { 1771 alias _mm_sqrt_ss = __builtin_ia32_sqrtss; 1772 } 1773 else version(LDC) 1774 { 1775 // Disappeared with LDC 1.11 1776 static if (__VERSION__ < 2081) 1777 alias _mm_sqrt_ss = __builtin_ia32_sqrtss; 1778 else 1779 { 1780 __m128 _mm_sqrt_ss(__m128 vec) pure @safe 1781 { 1782 vec.array[0] = llvm_sqrt(vec.array[0]); 1783 vec.array[1] = vec.array[1]; 1784 vec.array[2] = vec.array[2]; 1785 vec.array[3] = vec.array[3]; 1786 return vec; 1787 } 1788 } 1789 } 1790 else 1791 { 1792 __m128 _mm_sqrt_ss(__m128 vec) pure @trusted 1793 { 1794 vec.ptr[0] = sqrt(vec.array[0]); 1795 return vec; 1796 } 1797 } 1798 unittest 1799 { 1800 __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f)); 1801 assert(A.array[0] == 2.0f); 1802 assert(A.array[1] == 4.0f); 1803 assert(A.array[2] == 4.0f); 1804 assert(A.array[3] == 4.0f); 1805 } 1806 1807 void _mm_store_ps (float* mem_addr, __m128 a) pure // not safe since nothing guarantees alignment 1808 { 1809 __m128* aligned = cast(__m128*)mem_addr; 1810 *aligned = a; 1811 } 1812 1813 alias _mm_store_ps1 = _mm_store1_ps; 1814 1815 void _mm_store_ss (float* mem_addr, __m128 a) pure @safe 1816 { 1817 *mem_addr = a.array[0]; 1818 } 1819 unittest 1820 { 1821 float a; 1822 _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546)); 1823 assert(a == 546); 1824 } 1825 1826 void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // not safe since nothing guarantees alignment 1827 { 1828 __m128* aligned = cast(__m128*)mem_addr; 1829 __m128 r; 1830 r.ptr[0] = a.array[0]; 1831 r.ptr[1] = a.array[0]; 1832 r.ptr[2] = a.array[0]; 1833 r.ptr[3] = a.array[0]; 1834 *aligned = r; 1835 } 1836 1837 void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted 1838 { 1839 long2 la = cast(long2)a; 1840 (*p).ptr[0] = la.array[1]; 1841 } 1842 unittest 1843 { 1844 __m64 R = _mm_setzero_si64(); 1845 long2 A = [13, 25]; 1846 _mm_storeh_pi(&R, cast(__m128)A); 1847 assert(R.array[0] == 25); 1848 } 1849 1850 void _mm_storel_pi(__m64* p, __m128 a) pure @trusted 1851 { 1852 long2 la = cast(long2)a; 1853 (*p).ptr[0] = la.array[0]; 1854 } 1855 unittest 1856 { 1857 __m64 R = _mm_setzero_si64(); 1858 long2 A = [13, 25]; 1859 _mm_storel_pi(&R, cast(__m128)A); 1860 assert(R.array[0] == 13); 1861 } 1862 1863 void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // not safe since nothing guarantees alignment 1864 { 1865 __m128* 
aligned = cast(__m128*)mem_addr; 1866 __m128 r; 1867 r.ptr[0] = a.array[3]; 1868 r.ptr[1] = a.array[2]; 1869 r.ptr[2] = a.array[1]; 1870 r.ptr[3] = a.array[0]; 1871 *aligned = r; 1872 } 1873 1874 void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe 1875 { 1876 storeUnaligned!(float4)(a, mem_addr); 1877 } 1878 1879 void _mm_stream_pi (__m64* mem_addr, __m64 a) 1880 { 1881 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 1882 *mem_addr = a; // it's a regular move instead 1883 } 1884 1885 // BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal 1886 // needs some IR outside this function that would say: 1887 // 1888 // !0 = !{ i32 1 } 1889 // 1890 // It's a LLVM IR metadata description. 1891 // Regardless, non-temporal moves are really dangerous for performance... 1892 void _mm_stream_ps (float* mem_addr, __m128 a) 1893 { 1894 __m128* dest = cast(__m128*)mem_addr; 1895 *dest = a; // it's a regular move instead 1896 } 1897 unittest 1898 { 1899 align(16) float[4] A; 1900 _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f)); 1901 assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f); 1902 } 1903 1904 __m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe 1905 { 1906 return a - b; 1907 } 1908 unittest 1909 { 1910 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1911 a = _mm_sub_ps(a, a); 1912 float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f]; 1913 assert(a.array == correct); 1914 } 1915 1916 __m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe 1917 { 1918 static if (GDC_with_SSE) 1919 return __builtin_ia32_subss(a, b); 1920 else 1921 { 1922 a[0] -= b[0]; 1923 return a; 1924 } 1925 } 1926 unittest 1927 { 1928 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1929 a = _mm_sub_ss(a, a); 1930 float[4] correct = [0.0f, -2.0, 3.0f, 1.0f]; 1931 assert(a.array == correct); 1932 } 1933 1934 1935 void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe 1936 { 1937 __m128 tmp3, tmp2, tmp1, tmp0; 1938 tmp0 = _mm_unpacklo_ps(row0, row1); 1939 tmp2 = _mm_unpacklo_ps(row2, row3); 1940 tmp1 = _mm_unpackhi_ps(row0, row1); 1941 tmp3 = _mm_unpackhi_ps(row2, row3); 1942 row0 = _mm_movelh_ps(tmp0, tmp2); 1943 row1 = _mm_movehl_ps(tmp2, tmp0); 1944 row2 = _mm_movelh_ps(tmp1, tmp3); 1945 row3 = _mm_movehl_ps(tmp3, tmp1); 1946 } 1947 1948 // Note: the only difference between these intrinsics is the signalling 1949 // behaviour of quiet NaNs. This is incorrect but the case where 1950 // you would want to differentiate between qNaN and sNaN and then 1951 // treat them differently on purpose seems extremely rare. 
1952 alias _mm_ucomieq_ss = _mm_comieq_ss; 1953 alias _mm_ucomige_ss = _mm_comige_ss; 1954 alias _mm_ucomigt_ss = _mm_comigt_ss; 1955 alias _mm_ucomile_ss = _mm_comile_ss; 1956 alias _mm_ucomilt_ss = _mm_comilt_ss; 1957 alias _mm_ucomineq_ss = _mm_comineq_ss; 1958 1959 1960 __m128 _mm_undefined_ps() pure @safe 1961 { 1962 __m128 undef = void; 1963 return undef; 1964 } 1965 1966 __m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted 1967 { 1968 __m128 r; 1969 r.ptr[0] = a.array[2]; 1970 r.ptr[1] = b.array[2]; 1971 r.ptr[2] = a.array[3]; 1972 r.ptr[3] = b.array[3]; 1973 return r; 1974 } 1975 1976 __m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted 1977 { 1978 __m128 r; 1979 r.ptr[0] = a.array[0]; 1980 r.ptr[1] = b.array[0]; 1981 r.ptr[2] = a.array[1]; 1982 r.ptr[3] = b.array[1]; 1983 return r; 1984 } 1985 1986 __m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe 1987 { 1988 return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b); 1989 } 1990 1991 1992 private 1993 { 1994 /// Returns: `true` if the pointer is suitably aligned. 1995 bool isPointerAligned(void* p, size_t alignment) pure 1996 { 1997 assert(alignment != 0); 1998 return ( cast(size_t)p & (alignment - 1) ) == 0; 1999 } 2000 2001 /// Returns: next pointer aligned with alignment bytes. 2002 void* nextAlignedPointer(void* start, size_t alignment) pure 2003 { 2004 return cast(void*)nextMultipleOf(cast(size_t)(start), alignment); 2005 } 2006 2007 // Returns number of bytes to actually allocate when asking 2008 // for a particular alignment 2009 @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure 2010 { 2011 enum size_t pointerSize = size_t.sizeof; 2012 return askedSize + alignment - 1 + pointerSize * 3; 2013 } 2014 2015 // Store pointer given my malloc, size and alignment 2016 @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure 2017 { 2018 enum size_t pointerSize = size_t.sizeof; 2019 char* start = cast(char*)raw + pointerSize * 3; 2020 void* aligned = nextAlignedPointer(start, alignment); 2021 void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize); 2022 *rawLocation = raw; 2023 size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize); 2024 *sizeLocation = size; 2025 size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize); 2026 *alignmentLocation = alignment; 2027 assert( isPointerAligned(aligned, alignment) ); 2028 return aligned; 2029 } 2030 2031 // Returns: x, multiple of powerOfTwo, so that x >= n. 2032 @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow 2033 { 2034 // check power-of-two 2035 assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0)); 2036 2037 size_t mask = ~(powerOfTwo - 1); 2038 return (n + powerOfTwo - 1) & mask; 2039 } 2040 } 2041 2042 unittest 2043 { 2044 assert(nextMultipleOf(0, 4) == 0); 2045 assert(nextMultipleOf(1, 4) == 4); 2046 assert(nextMultipleOf(2, 4) == 4); 2047 assert(nextMultipleOf(3, 4) == 4); 2048 assert(nextMultipleOf(4, 4) == 4); 2049 assert(nextMultipleOf(5, 4) == 8); 2050 2051 { 2052 void* p = _mm_malloc(23, 16); 2053 assert(p !is null); 2054 assert(((cast(size_t)p) & 0xf) == 0); 2055 _mm_free(p); 2056 } 2057 2058 void* nullAlloc = _mm_malloc(0, 32); 2059 assert(nullAlloc != null); 2060 _mm_free(nullAlloc); 2061 } 2062 2063 // For some reason, order of declaration is important for this one... 4 2064 // so it is misplaced. 2065 // Note: is just another name for _mm_cvtss_si32 2066 alias _mm_cvt_ss2si = _mm_cvtss_si32;
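// Usage note for the packed comparisons (_mm_cmpeq_ps, _mm_cmplt_ps, ...): each lane of
// the result is all-ones when the predicate holds and all-zeroes otherwise, so the result
// is typically reinterpreted as an integer mask.
unittest
{
    __m128i R = cast(__m128i) _mm_cmplt_ps(_mm_setr_ps(-1, 0, 2, float.nan),
                                           _mm_setr_ps( 0, 0, 1, 1));
    int[4] correct = [-1, 0, 0, 0];
    assert(R.array == correct);
}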
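// Example for `_mm_shuffle_ps`: the two lower lanes of the result are selected from `a`,
// the two upper lanes from `b`. Uses the `_MM_SHUFFLE` helper already used for
// `_mm_shuffle_pi16` above.
unittest
{
    __m128 A = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
    __m128 R = _mm_shuffle_ps!(_MM_SHUFFLE(3, 2, 1, 0))(A, B);
    float[4] correct = [0.0f, 1.0f, 6.0f, 7.0f];
    assert(R.array == correct);
}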
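// Lane placement of `_mm_movehl_ps` and `_mm_movelh_ps` (MOVHLPS / MOVLHPS semantics).
unittest
{
    __m128 A = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
    __m128 HL = _mm_movehl_ps(A, B);
    __m128 LH = _mm_movelh_ps(A, B);
    float[4] correctHL = [6.0f, 7.0f, 2.0f, 3.0f];
    float[4] correctLH = [0.0f, 1.0f, 4.0f, 5.0f];
    assert(HL.array == correctHL);
    assert(LH.array == correctLH);
}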
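// `_MM_TRANSPOSE4_PS` treats the four vectors as the rows of a 4x4 matrix and
// transposes it in place; rows become columns.
unittest
{
    __m128 R0 = _mm_setr_ps( 1.0f,  2.0f,  3.0f,  4.0f);
    __m128 R1 = _mm_setr_ps( 5.0f,  6.0f,  7.0f,  8.0f);
    __m128 R2 = _mm_setr_ps( 9.0f, 10.0f, 11.0f, 12.0f);
    __m128 R3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
    _MM_TRANSPOSE4_PS(R0, R1, R2, R3);
    float[4] correct0 = [1.0f, 5.0f,  9.0f, 13.0f];
    float[4] correct1 = [2.0f, 6.0f, 10.0f, 14.0f];
    assert(R0.array == correct0);
    assert(R1.array == correct1);
}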
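// `_mm_unpacklo_ps` / `_mm_unpackhi_ps` interleave the lower (resp. upper) halves of `a` and `b`.
unittest
{
    __m128 A = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
    __m128 B = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
    __m128 L = _mm_unpacklo_ps(A, B);
    __m128 H = _mm_unpackhi_ps(A, B);
    float[4] correctL = [0.0f, 4.0f, 1.0f, 5.0f];
    float[4] correctH = [2.0f, 6.0f, 3.0f, 7.0f];
    assert(L.array == correctL);
    assert(H.array == correctH);
}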
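// Round-trip through the unaligned load/store intrinsics, plus the reversed aligned load.
// `_mm_loadu_ps`/`_mm_storeu_ps` accept any address; `_mm_loadr_ps` expects a 16-byte aligned one.
unittest
{
    float[5] src = [0.0f, 1.0f, 2.0f, 3.0f, 4.0f];
    __m128 V = _mm_loadu_ps(src.ptr + 1); // source not necessarily 16-byte aligned
    float[4] dst;
    _mm_storeu_ps(dst.ptr, V);
    float[4] expected = [1.0f, 2.0f, 3.0f, 4.0f];
    assert(dst == expected);

    align(16) float[4] asrc = [1.0f, 2.0f, 3.0f, 4.0f];
    __m128 Rev = _mm_loadr_ps(asrc.ptr);
    float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f];
    assert(Rev.array == correct);
}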
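// Bitwise ops on float vectors: XOR-ing a register with itself is the usual way to zero it,
// while OR with itself leaves the bit pattern unchanged.
unittest
{
    __m128 A = _mm_set1_ps(42.0f);
    __m128 Z = _mm_xor_ps(A, A);
    float[4] zero = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(Z.array == zero);
    __m128i O = cast(__m128i) _mm_or_ps(A, A);
    __m128i AI = cast(__m128i) A;
    assert(O.array == AI.array);
}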