1 /** 2 * SSE intrinsics. 3 * 4 * Copyright: Copyright Guillaume Piolat 2016-2020. 5 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 6 */ 7 module inteli.xmmintrin; 8 9 public import inteli.types; 10 11 import inteli.internals; 12 13 import inteli.mmx; 14 import inteli.emmintrin; 15 16 import core.stdc.stdlib: malloc, free; 17 import core.exception: onOutOfMemoryError; 18 19 version(D_InlineAsm_X86) 20 version = InlineX86Asm; 21 else version(D_InlineAsm_X86_64) 22 version = InlineX86Asm; 23 24 25 // SSE1 26 27 nothrow @nogc: 28 29 30 enum int _MM_EXCEPT_INVALID = 0x0001; /// MXCSR Exception states. 31 enum int _MM_EXCEPT_DENORM = 0x0002; ///ditto 32 enum int _MM_EXCEPT_DIV_ZERO = 0x0004; ///ditto 33 enum int _MM_EXCEPT_OVERFLOW = 0x0008; ///ditto 34 enum int _MM_EXCEPT_UNDERFLOW = 0x0010; ///ditto 35 enum int _MM_EXCEPT_INEXACT = 0x0020; ///ditto 36 enum int _MM_EXCEPT_MASK = 0x003f; /// MXCSR Exception states mask. 37 38 enum int _MM_MASK_INVALID = 0x0080; /// MXCSR Exception masks. 39 enum int _MM_MASK_DENORM = 0x0100; ///ditto 40 enum int _MM_MASK_DIV_ZERO = 0x0200; ///ditto 41 enum int _MM_MASK_OVERFLOW = 0x0400; ///ditto 42 enum int _MM_MASK_UNDERFLOW = 0x0800; ///ditto 43 enum int _MM_MASK_INEXACT = 0x1000; ///ditto 44 enum int _MM_MASK_MASK = 0x1f80; /// MXCSR Exception masks mask. 45 46 enum int _MM_ROUND_NEAREST = 0x0000; /// MXCSR Rounding mode. 47 enum int _MM_ROUND_DOWN = 0x2000; ///ditto 48 enum int _MM_ROUND_UP = 0x4000; ///ditto 49 enum int _MM_ROUND_TOWARD_ZERO = 0x6000; ///ditto 50 enum int _MM_ROUND_MASK = 0x6000; /// MXCSR Rounding mode mask. 51 52 enum int _MM_FLUSH_ZERO_MASK = 0x8000; /// MXCSR Denormal flush to zero mask. 53 enum int _MM_FLUSH_ZERO_ON = 0x8000; /// MXCSR Denormal flush to zero modes. 54 enum int _MM_FLUSH_ZERO_OFF = 0x0000; ///ditto 55 56 /// Add packed single-precision (32-bit) floating-point elements in `a` and `b`. 57 __m128 _mm_add_ps(__m128 a, __m128 b) pure @safe 58 { 59 return a + b; 60 } 61 unittest 62 { 63 __m128 a = [1, 2, 3, 4]; 64 a = _mm_add_ps(a, a); 65 assert(a.array[0] == 2); 66 assert(a.array[1] == 4); 67 assert(a.array[2] == 6); 68 assert(a.array[3] == 8); 69 } 70 71 /// Add the lower single-precision (32-bit) floating-point element 72 /// in `a` and `b`, store the result in the lower element of result, 73 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 74 __m128 _mm_add_ss(__m128 a, __m128 b) pure @safe 75 { 76 static if (GDC_with_SSE) 77 { 78 return __builtin_ia32_addss(a, b); 79 } 80 else static if (DMD_with_DSIMD) 81 { 82 return cast(__m128) __simd(XMM.ADDSS, a, b); 83 } 84 else 85 { 86 a[0] += b[0]; 87 return a; 88 } 89 } 90 unittest 91 { 92 __m128 a = [1, 2, 3, 4]; 93 a = _mm_add_ss(a, a); 94 assert(a.array == [2.0f, 2, 3, 4]); 95 } 96 97 /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`. 98 __m128 _mm_and_ps (__m128 a, __m128 b) pure @safe 99 { 100 return cast(__m128)(cast(__m128i)a & cast(__m128i)b); 101 } 102 unittest 103 { 104 float a = 4.32f; 105 float b = -78.99f; 106 int correct = (*cast(int*)(&a)) & (*cast(int*)(&b)); 107 __m128 A = _mm_set_ps(a, b, a, b); 108 __m128 B = _mm_set_ps(b, a, b, a); 109 int4 R = cast(int4)( _mm_and_ps(A, B) ); 110 assert(R.array[0] == correct); 111 assert(R.array[1] == correct); 112 assert(R.array[2] == correct); 113 assert(R.array[3] == correct); 114 } 115 116 /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`. 
__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int correct = ~(*cast(int*)(&a)) & (*cast(int*)(&b));
    int correct2 = (*cast(int*)(&a)) & ~(*cast(int*)(&b));
    __m128 A = _mm_set_ps(a, b, a, b);
    __m128 B = _mm_set_ps(b, a, b, a);
    int4 R = cast(int4)( _mm_andnot_ps(A, B) );
    assert(R.array[0] == correct2);
    assert(R.array[1] == correct);
    assert(R.array[2] == correct2);
    assert(R.array[3] == correct);
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for equality.
__m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for equality,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal.
__m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.oge)(a, b);
}
unittest
{
    __m128i R = cast(__m128i) _mm_cmpge_ps(_mm_setr_ps(0, 1, -1, float.nan),
                                           _mm_setr_ps(0, 0, 0, 0));
    int[4] correct = [-1, -1, 0, 0];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.oge)(a, b);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than.
__m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal.
__m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ole)(a, b);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ole)(a, b);
}
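
// Small illustrative check (not exhaustive): ordered comparisons such as
// `_mm_cmple_ps` yield all-zero lanes whenever either input lane is NaN.
unittest
{
    __m128i R = cast(__m128i) _mm_cmple_ps(_mm_setr_ps(1.0f, 3.0f, float.nan, 2.0f),
                                           _mm_setr_ps(2.0f, 2.0f, 2.0f, float.nan));
    int[4] correct = [-1, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than.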
208 __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe 209 { 210 return cast(__m128) cmpps!(FPComparison.olt)(a, b); 211 } 212 213 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than, 214 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 215 __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe 216 { 217 return cast(__m128) cmpss!(FPComparison.olt)(a, b); 218 } 219 220 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-equal. 221 __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe 222 { 223 return cast(__m128) cmpps!(FPComparison.une)(a, b); 224 } 225 226 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-equal, 227 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 228 __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe 229 { 230 return cast(__m128) cmpss!(FPComparison.une)(a, b); 231 } 232 233 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal. 234 __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe 235 { 236 return cast(__m128) cmpps!(FPComparison.ult)(a, b); 237 } 238 239 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal, 240 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 241 __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe 242 { 243 return cast(__m128) cmpss!(FPComparison.ult)(a, b); 244 } 245 246 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than. 247 __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe 248 { 249 return cast(__m128) cmpps!(FPComparison.ule)(a, b); 250 } 251 252 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than, 253 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 254 __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe 255 { 256 return cast(__m128) cmpss!(FPComparison.ule)(a, b); 257 } 258 259 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal. 260 __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe 261 { 262 return cast(__m128) cmpps!(FPComparison.ugt)(a, b); 263 } 264 265 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal, 266 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 267 __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe 268 { 269 return cast(__m128) cmpss!(FPComparison.ugt)(a, b); 270 } 271 272 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than. 273 __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe 274 { 275 return cast(__m128) cmpps!(FPComparison.uge)(a, b); 276 } 277 278 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than, 279 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 280 __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe 281 { 282 return cast(__m128) cmpss!(FPComparison.uge)(a, b); 283 } 284 285 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN. 
__m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.ord)(a, b);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.ord)(a, b);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN.
__m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpps!(FPComparison.uno)(a, b);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
{
    return cast(__m128) cmpss!(FPComparison.uno)(a, b);
}

// Note: we've reversed clang and GCC behaviour with regard to EFLAGS.
// Some of these comparisons yield true for NaNs, others don't.

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for equality,
/// and return the boolean result (0 or 1).
int _mm_comieq_ss (__m128 a, __m128 b) pure @safe // comiss + sete
{
    return comss!(FPComparison.ueq)(a, b); // yields true for NaN!
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than-or-equal,
/// and return the boolean result (0 or 1).
int _mm_comige_ss (__m128 a, __m128 b) pure @safe // comiss + setae
{
    return comss!(FPComparison.oge)(a, b);
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than,
/// and return the boolean result (0 or 1).
int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
{
    return comss!(FPComparison.ogt)(a, b);
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than-or-equal,
/// and return the boolean result (0 or 1).
int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
{
    return comss!(FPComparison.ule)(a, b); // yields true for NaN!
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than,
/// and return the boolean result (0 or 1).
int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
{
    return comss!(FPComparison.ult)(a, b); // yields true for NaN!
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for not-equal,
/// and return the boolean result (0 or 1).
int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
{
    return comss!(FPComparison.one)(a, b);
}

/// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit)
/// floating-point elements, store the results in the lower 2 elements,
/// and copy the upper 2 packed elements from `a` to the upper elements of result.
alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;
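
// Illustrative check of the NaN behaviour noted in the comments above: with a NaN
// operand, `_mm_comieq_ss`, `_mm_comile_ss` and `_mm_comilt_ss` return 1, while the
// other three comiss-style comparisons return 0.
unittest
{
    __m128 A = _mm_setr_ps(float.nan, 0, 0, 0);
    __m128 B = _mm_setr_ps(1.0f, 0, 0, 0);
    assert(_mm_comieq_ss(A, B) == 1);
    assert(_mm_comile_ss(A, B) == 1);
    assert(_mm_comilt_ss(A, B) == 1);
    assert(_mm_comige_ss(A, B) == 0);
    assert(_mm_comigt_ss(A, B) == 0);
    assert(_mm_comineq_ss(A, B) == 0);
}

/// Convert 2 lower packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.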
__m64 _mm_cvt_ps2pi (__m128 a) @safe
{
    return to_m64(_mm_cvtps_epi32(a));
}

/// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element,
/// store the result in the lower element, and copy the upper 3 packed elements from `a` to the
/// upper elements of the result.
__m128 _mm_cvt_si2ss (__m128 v, int x) pure @trusted
{
    v.ptr[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42f, 0, 0, 0]);
}

/// Convert packed 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpi16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpi16_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit)
/// floating-point elements, store the results in the lower 2 elements,
/// and copy the upper 2 packed elements from `a` to the upper elements of result.
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted
{
    __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
    a.ptr[0] = fb.array[0];
    a.ptr[1] = fb.array[1];
    return a;
}
unittest
{
    __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
    float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements,
/// store the results in the lower 2 elements, then convert the packed signed 32-bit integers in `b` to
/// single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements.
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted
{
    long2 l;
    l.ptr[0] = a.array[0];
    l.ptr[1] = b.array[0];
    return _mm_cvtepi32_ps(cast(__m128i)l);
}
unittest
{
    __m64 A = _mm_setr_pi32(-45, 128);
    __m64 B = _mm_setr_pi32(0, 1000);
    __m128 R = _mm_cvtpi32x2_ps(A, B);
    float[4] correct = [-45.0f, 128.0f, 0.0f, 1000.0f];
    assert(R.array == correct);
}

/// Convert the lower packed 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpi8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());

    // Replicate sign bit
    b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpi8_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}
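
// Late illustrative check for `_mm_cvt_ps2pi` (declared above): only the two lower
// lanes take part in the conversion; integral inputs avoid any rounding-mode dependence.
unittest
{
    int2 R = cast(int2) _mm_cvt_ps2pi(_mm_setr_ps(-50.0f, 700.0f, 3.0f, 4.0f));
    int[2] correct = [-50, 700];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 16-bit integers.
/// Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF.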
__m64 _mm_cvtps_pi16 (__m128 a) @safe
{
    // The C++ version of this intrinsic converts to 32-bit integers first, then uses packssdw,
    // which means the 16-bit results are saturated.
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, b);
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
    short4 R = cast(short4) _mm_cvtps_pi16(A);
    short[4] correct = [-1, 2, -32768, 32767];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers.
__m64 _mm_cvtps_pi32 (__m128 a) @safe
{
    return to_m64(_mm_cvtps_epi32(a));
}
unittest
{
    __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f);
    int2 R = cast(int2) _mm_cvtps_pi32(A);
    int[2] correct = [-33000, 70000];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 8-bit integers,
/// and store the results in lower 4 elements.
/// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF.
__m64 _mm_cvtps_pi8 (__m128 a) @safe
{
    // The C++ version of this intrinsic converts to 32-bit integers first, then uses packssdw + packsswb,
    // which means the 8-bit results are saturated.
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, _mm_setzero_si128());
    b = _mm_packs_epi16(b, _mm_setzero_si128());
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
    byte8 R = cast(byte8) _mm_cvtps_pi8(A);
    byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Convert packed unsigned 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpu16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpu16_ps(A);
    float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert the lower packed unsigned 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpu8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpu8_ps(A);
    float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element,
/// store the result in the lower element, and copy the upper 3 packed elements from `a` to the
/// upper elements of result.
542 __m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted 543 { 544 v.ptr[0] = cast(float)x; 545 return v; 546 } 547 unittest 548 { 549 __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42); 550 assert(a.array == [42.0f, 0, 0, 0]); 551 } 552 553 554 /// Convert the signed 64-bit integer `b` to a single-precision (32-bit) floating-point element, 555 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 556 /// upper elements of result. 557 __m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted 558 { 559 v.ptr[0] = cast(float)x; 560 return v; 561 } 562 unittest 563 { 564 __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42); 565 assert(a.array == [42.0f, 0, 0, 0]); 566 } 567 568 /// Take the lower single-precision (32-bit) floating-point element of `a`. 569 float _mm_cvtss_f32(__m128 a) pure @safe 570 { 571 return a.array[0]; 572 } 573 574 static if (LDC_with_SSE1) 575 { 576 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer. 577 int _mm_cvtss_si32 (__m128 a) @safe // PERF GDC 578 { 579 return __builtin_ia32_cvtss2si(a); 580 } 581 } 582 else 583 { 584 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer. 585 int _mm_cvtss_si32 (__m128 a) @safe 586 { 587 return convertFloatToInt32UsingMXCSR(a.array[0]); 588 } 589 } 590 unittest 591 { 592 assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f))); 593 } 594 595 version(LDC) 596 { 597 version(X86_64) 598 { 599 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer. 600 long _mm_cvtss_si64 (__m128 a) @safe 601 { 602 return __builtin_ia32_cvtss2si64(a); 603 } 604 } 605 else 606 { 607 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer. 608 long _mm_cvtss_si64 (__m128 a) @safe 609 { 610 // Note: __builtin_ia32_cvtss2si64 crashes LDC in 32-bit 611 return convertFloatToInt64UsingMXCSR(a.array[0]); 612 } 613 } 614 } 615 else 616 { 617 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer. 618 long _mm_cvtss_si64 (__m128 a) @safe 619 { 620 return convertFloatToInt64UsingMXCSR(a.array[0]); 621 } 622 } 623 unittest 624 { 625 assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f))); 626 627 uint savedRounding = _MM_GET_ROUNDING_MODE(); 628 629 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 630 assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f))); 631 632 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 633 assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f))); 634 635 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 636 assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f))); 637 638 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 639 assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f))); 640 641 _MM_SET_ROUNDING_MODE(savedRounding); 642 } 643 644 645 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit 646 /// integer with truncation. 647 int _mm_cvtt_ss2si (__m128 a) pure @safe 648 { 649 // x86: cvttss2si always generated, even in -O0 650 return cast(int)(a.array[0]); 651 } 652 alias _mm_cvttss_si32 = _mm_cvtt_ss2si; ///ditto 653 unittest 654 { 655 assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); 656 } 657 658 659 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit 660 /// integers with truncation. 
__m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
{
    return to_m64(_mm_cvttps_epi32(a));
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit
/// integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]);
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    return a / b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ps(a, a);
    float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}

/// Divide the lower single-precision (32-bit) floating-point element in `a` by the lower
/// single-precision (32-bit) floating-point element in `b`, store the result in the lower
/// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
        return __builtin_ia32_divss(a, b);
    else
    {
        a[0] /= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ss(a, a);
    float[4] correct = [1.0f, -2.0f, 3.0f, 1.0f];
    assert(a.array == correct);
}

/// Extract a 16-bit unsigned integer from `a`, selected with `imm8`. Zero-extended.
int _mm_extract_pi16 (__m64 a, int imm8)
{
    short4 sa = cast(short4)a;
    return cast(ushort)(sa.array[imm8]);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 6, 0, 4);
    assert(_mm_extract_pi16(A, 0) == 65535);
    assert(_mm_extract_pi16(A, 1) == 6);
    assert(_mm_extract_pi16(A, 2) == 0);
    assert(_mm_extract_pi16(A, 3) == 4);
}

/// Free aligned memory that was allocated with `_mm_malloc`.
void _mm_free(void * mem_addr) @trusted
{
    // support for free(NULL)
    if (mem_addr is null)
        return;

    // Technically we don't need to store size and alignment in the chunk, but we do in case we
    // have to implement _mm_realloc

    size_t pointerSize = (void*).sizeof;
    void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof);
    size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
    size_t alignment = *alignmentLocation;
    assert(alignment != 0);
    assert(isPointerAligned(mem_addr, alignment));
    free(*rawLocation);
}

/// Get the exception mask bits from the MXCSR control and status register.
/// The exception mask may contain any of the following flags: `_MM_MASK_INVALID`,
/// `_MM_MASK_DIV_ZERO`, `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
/// Note: won't correspond to reality on non-x86, where MXCSR is emulated.
uint _MM_GET_EXCEPTION_MASK() @safe
{
    return _mm_getcsr() & _MM_MASK_MASK;
}
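
// Minimal usage sketch for `_mm_free` above together with `_mm_malloc` (defined later
// in this module): the returned pointer honours the requested alignment, and freeing
// `null` is explicitly supported.
unittest
{
    void* p = _mm_malloc(64, 16);
    assert(p !is null);
    assert((cast(size_t)p & 15) == 0); // respects the requested 16-byte alignment
    _mm_free(p);
    _mm_free(null);
}

/// Get the exception state bits from the MXCSR control and status register.
/// The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`,
/// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
/// Note: won't correspond to reality on non-x86, where MXCSR is emulated. No exception reported.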
uint _MM_GET_EXCEPTION_STATE() @safe
{
    return _mm_getcsr() & _MM_EXCEPT_MASK;
}

/// Get the flush zero bits from the MXCSR control and status register.
/// The flush zero mode may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
uint _MM_GET_FLUSH_ZERO_MODE() @safe
{
    return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/// Get the rounding mode bits from the MXCSR control and status register. The rounding mode may
/// contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
uint _MM_GET_ROUNDING_MODE() @safe
{
    return _mm_getcsr() & _MM_ROUND_MASK;
}

/// Get the unsigned 32-bit value of the MXCSR control and status register.
/// Note: this is emulated on ARM, since there is no MXCSR register there.
uint _mm_getcsr() @trusted
{
    static if (LDC_with_ARM)
    {
        // Note: we convert the ARM FPSCR into an x86 SSE control word.
        // However, only rounding mode and flush to zero are actually set.
        // The returned control word will have all exceptions masked, and no exception detected.

        uint fpscr = arm_get_fpcr();

        uint cw = 0; // No exception detected
        if (fpscr & _MM_FLUSH_ZERO_MASK_ARM)
        {
            // ARM has a single flush-to-zero flag, which covers both x86 bits.
            // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode
            cw |= _MM_FLUSH_ZERO_ON;
            cw |= 0x40; // set "denormals are zeros"
        }
        cw |= _MM_MASK_MASK; // All exceptions masked

        // Rounding mode
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     cw |= _MM_ROUND_NEAREST;      break;
            case _MM_ROUND_DOWN_ARM:        cw |= _MM_ROUND_DOWN;         break;
            case _MM_ROUND_UP_ARM:          cw |= _MM_ROUND_UP;           break;
            case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO;  break;
        }
        return cw;
    }
    else version(GNU)
    {
        static if (GDC_with_SSE)
        {
            return __builtin_ia32_stmxcsr();
        }
        else version(X86)
        {
            uint sseRounding = 0;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %0;\n"
                : "=m" (sseRounding)
                :
                : ;
            }
            return sseRounding;
        }
        else
            static assert(false);
    }
    else version (InlineX86Asm)
    {
        uint controlWord;
        asm nothrow @nogc pure @safe
        {
            stmxcsr controlWord;
        }
        return controlWord;
    }
    else
        static assert(0, "Not yet supported");
}
unittest
{
    uint csr = _mm_getcsr();
}

/// Insert a 16-bit integer `i` inside `a` at the location specified by `imm8`.
__m64 _mm_insert_pi16 (__m64 v, int i, int imm8) pure @trusted
{
    short4 r = cast(short4)v;
    r.ptr[imm8 & 3] = cast(short)i;
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_set_pi16(3, 2, 1, 0);
    short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4);
    short[4] correct = [0, 42, 2, 3];
    assert(R.array == correct);
}
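
// Illustrative round-trip through the MXCSR accessors above (relies on
// `_MM_SET_ROUNDING_MODE`, defined further below): the rounding mode that is set
// should read back unchanged, even where MXCSR is emulated.
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(_MM_GET_ROUNDING_MODE() == _MM_ROUND_TOWARD_ZERO);
    _MM_SET_ROUNDING_MODE(savedRounding);
    assert(_MM_GET_ROUNDING_MODE() == savedRounding);
}

/// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory.
/// `p` must be aligned on a 16-byte boundary or a general-protection exception may be generated.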
866 __m128 _mm_load_ps(const(float)*p) pure @trusted // TODO shouldn't be trusted 867 { 868 return *cast(__m128*)p; 869 } 870 unittest 871 { 872 static immutable align(16) float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f]; 873 __m128 A = _mm_load_ps(correct.ptr); 874 assert(A.array == correct); 875 } 876 877 /// Load a single-precision (32-bit) floating-point element from memory into all elements. 878 __m128 _mm_load_ps1(const(float)*p) pure @trusted 879 { 880 return __m128(*p); 881 } 882 unittest 883 { 884 float n = 2.5f; 885 float[4] correct = [2.5f, 2.5f, 2.5f, 2.5f]; 886 __m128 A = _mm_load_ps1(&n); 887 assert(A.array == correct); 888 } 889 890 /// Load a single-precision (32-bit) floating-point element from memory into the lower of dst, and zero the upper 3 891 /// elements. `mem_addr` does not need to be aligned on any particular boundary. 892 __m128 _mm_load_ss (const(float)* mem_addr) pure @trusted 893 { 894 __m128 r; 895 r.ptr[0] = *mem_addr; 896 r.ptr[1] = 0; 897 r.ptr[2] = 0; 898 r.ptr[3] = 0; 899 return r; 900 } 901 unittest 902 { 903 float n = 2.5f; 904 float[4] correct = [2.5f, 0.0f, 0.0f, 0.0f]; 905 __m128 A = _mm_load_ss(&n); 906 assert(A.array == correct); 907 } 908 909 /// Load a single-precision (32-bit) floating-point element from memory into all elements. 910 alias _mm_load1_ps = _mm_load_ps1; 911 912 /// Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of result, 913 /// and copy the lower 2 elements from `a` to result. `mem_addr does` not need to be aligned on any particular boundary. 914 __m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted 915 { 916 // x86: movlhps generated since LDC 1.9.0 -O1 917 long2 la = cast(long2)a; 918 la.ptr[1] = (*mem_addr).array[0]; 919 return cast(__m128)la; 920 } 921 unittest 922 { 923 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 924 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 925 __m64 M = to_m64(cast(__m128i)B); 926 __m128 R = _mm_loadh_pi(A, &M); 927 float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f]; 928 assert(R.array == correct); 929 } 930 931 /// Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of result, 932 /// and copy the upper 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary. 933 __m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted 934 { 935 // x86: movlpd/movlps generated with all LDC -01 936 long2 la = cast(long2)a; 937 la.ptr[0] = (*mem_addr).array[0]; 938 return cast(__m128)la; 939 } 940 unittest 941 { 942 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 943 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 944 __m64 M = to_m64(cast(__m128i)B); 945 __m128 R = _mm_loadl_pi(A, &M); 946 float[4] correct = [5.0f, 6.0f, 3.0f, 4.0f]; 947 assert(R.array == correct); 948 } 949 950 /// Load 4 single-precision (32-bit) floating-point elements from memory in reverse order. 951 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
952 __m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted // TODO shouldn't be trusted 953 { 954 __m128* aligned = cast(__m128*)mem_addr; // x86: movaps + shups since LDC 1.0.0 -O1 955 __m128 a = *aligned; 956 __m128 r; 957 r.ptr[0] = a.array[3]; 958 r.ptr[1] = a.array[2]; 959 r.ptr[2] = a.array[1]; 960 r.ptr[3] = a.array[0]; 961 return r; 962 } 963 unittest 964 { 965 align(16) static immutable float[4] arr = [ 1.0f, 2.0f, 3.0f, 8.0f ]; 966 __m128 A = _mm_loadr_ps(arr.ptr); 967 float[4] correct = [ 8.0f, 3.0f, 2.0f, 1.0f ]; 968 assert(A.array == correct); 969 } 970 971 /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into dst. 972 /// `p` does not need to be aligned on any particular boundary. 973 __m128 _mm_loadu_ps(const(float)*p) pure @safe 974 { 975 return loadUnaligned!(__m128)(p); 976 } 977 unittest 978 { 979 align(16) static immutable float[5] arr = [ 1.0f, 2.0f, 3.0f, 8.0f, 9.0f ]; // force unaligned load 980 __m128 A = _mm_loadu_ps(&arr[1]); 981 float[4] correct = [ 2.0f, 3.0f, 8.0f, 9.0f ]; 982 assert(A.array == correct); 983 } 984 985 /// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise. 986 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted 987 { 988 short r = *cast(short*)(mem_addr); 989 short8 result = [0, 0, 0, 0, 0, 0, 0, 0]; 990 result.ptr[0] = r; 991 return cast(__m128i)result; 992 } 993 unittest 994 { 995 short r = 13; 996 short8 A = cast(short8) _mm_loadu_si16(&r); 997 short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0]; 998 assert(A.array == correct); 999 } 1000 1001 /// Load unaligned 64-bit integer from memory into the first element of result. 1002 /// Upper 64-bit is zeroed. 1003 __m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted 1004 { 1005 long r = *cast(long*)(mem_addr); 1006 long2 result = [0, 0]; 1007 result.ptr[0] = r; 1008 return cast(__m128i)result; 1009 } 1010 unittest 1011 { 1012 long r = 446446446446; 1013 long2 A = cast(long2) _mm_loadu_si64(&r); 1014 long[2] correct = [446446446446, 0]; 1015 assert(A.array == correct); 1016 } 1017 1018 /// Allocate size bytes of memory, aligned to the alignment specified in align, 1019 /// and return a pointer to the allocated memory. `_mm_free` should be used to free 1020 /// memory that is allocated with `_mm_malloc`. 1021 void* _mm_malloc(size_t size, size_t alignment) @trusted 1022 { 1023 assert(alignment != 0); 1024 size_t request = requestedSize(size, alignment); 1025 void* raw = malloc(request); 1026 if (request > 0 && raw == null) // malloc(0) can validly return anything 1027 onOutOfMemoryError(); 1028 return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size 1029 } 1030 1031 /// Conditionally store 8-bit integer elements from a into memory using mask (elements are not stored when the highest 1032 /// bit is not set in the corresponding element) and a non-temporal memory hint. 1033 void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted 1034 { 1035 // this works since mask is zero-extended 1036 return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr); 1037 } 1038 1039 deprecated("Use _mm_maskmove_si64 instead") alias _m_maskmovq = _mm_maskmove_si64;/// 1040 1041 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum value. 
__m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b)));
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed maximum values.
__m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_maxps(a, b);
    }
    else static if (LDC_with_SSE1)
    {
        return __builtin_ia32_maxps(a, b);
    }
    else
    {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
        __m128 r;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        r[1] = (a[1] > b[1]) ? a[1] : b[1];
        r[2] = (a[2] > b[2]) ? a[2] : b[2];
        r[3] = (a[3] > b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_max_ps(A, B);
    assert(M.array[0] == 4);
    assert(M.array[1] == 2);
    assert(M.array[2] == 4);               // in case of NaN, second operand prevails (as it seems)
    assert(M.array[3] != M.array[3]);      // in case of NaN, second operand prevails (as it seems)
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
__m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b)));
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the maximum value in the
/// lower element of result, and copy the upper 3 packed elements from `a` to the upper element of result.
__m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_maxss(a, b);
    }
    else static if (LDC_with_SSE1)
    {
        return __builtin_ia32_maxss(a, b);
    }
    else
    {
        __m128 r = a;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_max_ss(A, B);
    assert(M.array[0] == 4);
    assert(M.array[1] == 2);
    assert(M.array[2] == 3);
    assert(M.array[3] == 4);
    M = _mm_max_ps(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_max_ps(C, A); // in case of NaN, second operand prevails
    assert(M.array[0] == 1);
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b)));
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed minimum values.
__m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_minps(a, b);
    }
    else static if (LDC_with_SSE1)
    {
        // not technically needed, but better perf in debug mode
        return __builtin_ia32_minps(a, b);
    }
    else
    {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2
        __m128 r;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        r[1] = (a[1] < b[1]) ? a[1] : b[1];
        r[2] = (a[2] < b[2]) ? a[2] : b[2];
        r[3] = (a[3] < b[3]) ?
a[3] : b[3]; 1147 return r; 1148 } 1149 } 1150 unittest 1151 { 1152 __m128 A = _mm_setr_ps(1, 2, float.nan, 4); 1153 __m128 B = _mm_setr_ps(4, 1, 4, float.nan); 1154 __m128 M = _mm_min_ps(A, B); 1155 assert(M.array[0] == 1); 1156 assert(M.array[1] == 1); 1157 assert(M.array[2] == 4); // in case of NaN, second operand prevails (as it seems) 1158 assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems) 1159 } 1160 1161 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values. 1162 __m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe 1163 { 1164 return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b))); 1165 } 1166 1167 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the minimum value in the 1168 /// lower element of result, and copy the upper 3 packed elements from `a` to the upper element of result. 1169 __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe 1170 { 1171 static if (GDC_with_SSE) 1172 { 1173 return __builtin_ia32_minss(a, b); 1174 } 1175 else static if (LDC_with_SSE1) 1176 { 1177 return __builtin_ia32_minss(a, b); 1178 } 1179 else 1180 { 1181 // Generates minss since LDC 1.3 -O1 1182 __m128 r = a; 1183 r[0] = (a[0] < b[0]) ? a[0] : b[0]; 1184 return r; 1185 } 1186 } 1187 unittest 1188 { 1189 __m128 A = _mm_setr_ps(1, 2, 3, 4); 1190 __m128 B = _mm_setr_ps(4, 1, 4, 1); 1191 __m128 C = _mm_setr_ps(float.nan, 1, 4, 1); 1192 __m128 M = _mm_min_ss(A, B); 1193 assert(M.array[0] == 1); 1194 assert(M.array[1] == 2); 1195 assert(M.array[2] == 3); 1196 assert(M.array[3] == 4); 1197 M = _mm_min_ps(A, C); // in case of NaN, second operand prevails 1198 assert(M.array[0] != M.array[0]); 1199 M = _mm_min_ps(C, A); // in case of NaN, second operand prevails 1200 assert(M.array[0] == 1); 1201 } 1202 1203 /// Move the lower single-precision (32-bit) floating-point element from `b` to the lower element of result, and copy 1204 /// the upper 3 packed elements from `a` to the upper elements of result. 1205 __m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted 1206 { 1207 a.ptr[0] = b.array[0]; 1208 return a; 1209 } 1210 unittest 1211 { 1212 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1213 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1214 __m128 R = _mm_move_ss(A, B); 1215 float[4] correct = [5.0f, 2.0f, 3.0f, 4.0f]; 1216 assert(R.array == correct); 1217 } 1218 1219 /// Move the upper 2 single-precision (32-bit) floating-point elements from `b` to the lower 2 elements of result, and 1220 /// copy the upper 2 elements from `a` to the upper 2 elements of dst. 
1221 __m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted 1222 { 1223 a.ptr[0] = b.array[2]; 1224 a.ptr[1] = b.array[3]; 1225 return a; 1226 } 1227 unittest 1228 { 1229 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1230 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1231 __m128 R = _mm_movehl_ps(A, B); 1232 float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f]; 1233 assert(R.array == correct); 1234 } 1235 1236 /// Move the lower 2 single-precision (32-bit) floating-point elements from `b` to the upper 2 elements of result, and 1237 /// copy the lower 2 elements from `a` to the lower 2 elements of result 1238 __m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted 1239 { 1240 a.ptr[2] = b.array[0]; 1241 a.ptr[3] = b.array[1]; 1242 return a; 1243 } 1244 unittest 1245 { 1246 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1247 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1248 __m128 R = _mm_movelh_ps(A, B); 1249 float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f]; 1250 assert(R.array == correct); 1251 } 1252 1253 /// Create mask from the most significant bit of each 8-bit element in `a`. 1254 int _mm_movemask_pi8 (__m64 a) pure @safe 1255 { 1256 return _mm_movemask_epi8(to_m128i(a)); 1257 } 1258 unittest 1259 { 1260 assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0))); 1261 } 1262 1263 /// Set each bit of result based on the most significant bit of the corresponding packed single-precision (32-bit) 1264 /// floating-point element in `a`. 1265 int _mm_movemask_ps (__m128 a) pure @trusted 1266 { 1267 static if (GDC_with_SSE) 1268 { 1269 return __builtin_ia32_movmskps(a); 1270 } 1271 else static if (LDC_with_SSE1) 1272 { 1273 return __builtin_ia32_movmskps(a); 1274 } 1275 else static if (LDC_with_ARM) 1276 { 1277 int4 ai = cast(int4)a; 1278 int4 shift31 = [31, 31, 31, 31]; 1279 ai = ai >>> shift31; 1280 int4 shift = [0, 1, 2, 3]; 1281 ai = ai << shift; // 4-way shift, only efficient on ARM. 1282 int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]); 1283 return r; 1284 } 1285 else 1286 { 1287 int4 ai = cast(int4)a; 1288 int r = 0; 1289 if (ai.array[0] < 0) r += 1; 1290 if (ai.array[1] < 0) r += 2; 1291 if (ai.array[2] < 0) r += 4; 1292 if (ai.array[3] < 0) r += 8; 1293 return r; 1294 } 1295 } 1296 unittest 1297 { 1298 int4 A = [-1, 0, -43, 0]; 1299 assert(5 == _mm_movemask_ps(cast(float4)A)); 1300 } 1301 1302 /// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`. 1303 __m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe 1304 { 1305 return a * b; 1306 } 1307 unittest 1308 { 1309 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1310 a = _mm_mul_ps(a, a); 1311 float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f]; 1312 assert(a.array == correct); 1313 } 1314 1315 /// Multiply the lower single-precision (32-bit) floating-point element in `a` and `b`, store the result in the lower 1316 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result. 1317 __m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe 1318 { 1319 static if (GDC_with_SSE) 1320 return __builtin_ia32_mulss(a, b); 1321 else 1322 { 1323 a[0] *= b[0]; 1324 return a; 1325 } 1326 } 1327 unittest 1328 { 1329 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1330 a = _mm_mul_ss(a, a); 1331 float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f]; 1332 assert(a.array == correct); 1333 } 1334 1335 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, 1336 /// and return the high 16 bits of the intermediate integers. 
1337 __m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe 1338 { 1339 return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b))); 1340 } 1341 unittest 1342 { 1343 __m64 A = _mm_setr_pi16(0, -16, 2, 3); 1344 __m64 B = _mm_set1_pi16(16384); 1345 short4 R = cast(short4)_mm_mulhi_pu16(A, B); 1346 short[4] correct = [0, 0x3FFC, 0, 0]; 1347 assert(R.array == correct); 1348 } 1349 1350 /// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`, and 1351 /// return the result. 1352 __m128 _mm_or_ps (__m128 a, __m128 b) pure @safe 1353 { 1354 return cast(__m128)(cast(__m128i)a | cast(__m128i)b); 1355 } 1356 1357 deprecated("Use _mm_avg_pu8 instead") alias _m_pavgb = _mm_avg_pu8;/// 1358 deprecated("Use _mm_avg_pu16 instead") alias _m_pavgw = _mm_avg_pu16;/// 1359 deprecated("Use _mm_extract_pi16 instead") alias _m_pextrw = _mm_extract_pi16;/// 1360 deprecated("Use _mm_insert_pi16 instead") alias _m_pinsrw = _mm_insert_pi16;/// 1361 deprecated("Use _mm_max_pi16 instead") alias _m_pmaxsw = _mm_max_pi16;/// 1362 deprecated("Use _mm_max_pu8 instead") alias _m_pmaxub = _mm_max_pu8;/// 1363 deprecated("Use _mm_min_pi16 instead") alias _m_pminsw = _mm_min_pi16;/// 1364 deprecated("Use _mm_min_pu8 instead") alias _m_pminub = _mm_min_pu8;/// 1365 deprecated("Use _mm_movemask_pi8 instead") alias _m_pmovmskb = _mm_movemask_pi8;/// 1366 deprecated("Use _mm_mulhi_pu16 instead") alias _m_pmulhuw = _mm_mulhi_pu16;/// 1367 1368 enum _MM_HINT_T0 = 3; /// 1369 enum _MM_HINT_T1 = 2; /// 1370 enum _MM_HINT_T2 = 1; /// 1371 enum _MM_HINT_NTA = 0; /// 1372 1373 1374 version(LDC) 1375 { 1376 // Starting with LLVM 10, it seems llvm.prefetch has changed its name. 1377 // Was reported at: https://github.com/ldc-developers/ldc/issues/3397 1378 static if (__VERSION__ >= 2091) 1379 { 1380 pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch" 1381 void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe; 1382 } 1383 } 1384 1385 /// Fetch the line of data from memory that contains address `p` to a location in the 1386 /// cache hierarchy specified by the locality hint i. 1387 /// 1388 /// Warning: `locality` is a compile-time parameter, unlike in Intel Intrinsics API. 1389 void _mm_prefetch(int locality)(const(void)* p) pure @trusted 1390 { 1391 static if (GDC_with_SSE) 1392 { 1393 return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3); 1394 } 1395 else version(LDC) 1396 { 1397 static if (__VERSION__ >= 2091) 1398 { 1399 // const_cast here. `llvm_prefetch` wants a mutable pointer 1400 llvm_prefetch_fixed( cast(void*)p, 0, locality, 1); 1401 } 1402 else 1403 { 1404 // const_cast here. 
`llvm_prefetch` wants a mutable pointer 1405 llvm_prefetch( cast(void*)p, 0, locality, 1); 1406 } 1407 } 1408 else version(D_InlineAsm_X86_64) 1409 { 1410 static if (locality == _MM_HINT_NTA) 1411 { 1412 asm pure nothrow @nogc @trusted 1413 { 1414 mov RAX, p; 1415 prefetchnta [RAX]; 1416 } 1417 } 1418 else static if (locality == _MM_HINT_T0) 1419 { 1420 asm pure nothrow @nogc @trusted 1421 { 1422 mov RAX, p; 1423 prefetcht0 [RAX]; 1424 } 1425 } 1426 else static if (locality == _MM_HINT_T1) 1427 { 1428 asm pure nothrow @nogc @trusted 1429 { 1430 mov RAX, p; 1431 prefetcht1 [RAX]; 1432 } 1433 } 1434 else static if (locality == _MM_HINT_T2) 1435 { 1436 asm pure nothrow @nogc @trusted 1437 { 1438 mov RAX, p; 1439 prefetcht2 [RAX]; 1440 } 1441 } 1442 else 1443 assert(false); // invalid locality hint 1444 } 1445 else version(D_InlineAsm_X86) 1446 { 1447 static if (locality == _MM_HINT_NTA) 1448 { 1449 asm pure nothrow @nogc @trusted 1450 { 1451 mov EAX, p; 1452 prefetchnta [EAX]; 1453 } 1454 } 1455 else static if (locality == _MM_HINT_T0) 1456 { 1457 asm pure nothrow @nogc @trusted 1458 { 1459 mov EAX, p; 1460 prefetcht0 [EAX]; 1461 } 1462 } 1463 else static if (locality == _MM_HINT_T1) 1464 { 1465 asm pure nothrow @nogc @trusted 1466 { 1467 mov EAX, p; 1468 prefetcht1 [EAX]; 1469 } 1470 } 1471 else static if (locality == _MM_HINT_T2) 1472 { 1473 asm pure nothrow @nogc @trusted 1474 { 1475 mov EAX, p; 1476 prefetcht2 [EAX]; 1477 } 1478 } 1479 else 1480 assert(false); // invalid locality hint 1481 } 1482 else 1483 { 1484 // Generic version: do nothing. From bitter experience, 1485 // it's unlikely you get ANY speed-up with manual prefetching. 1486 // Prefetching or not doesn't change program behaviour. 1487 } 1488 } 1489 unittest 1490 { 1491 // From Intel documentation: 1492 // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of 1493 // 32 bytes." 1494 ubyte[256] cacheline; // though it seems it cannot generate GP fault 1495 _mm_prefetch!_MM_HINT_T0(cacheline.ptr); 1496 _mm_prefetch!_MM_HINT_T1(cacheline.ptr); 1497 _mm_prefetch!_MM_HINT_T2(cacheline.ptr); 1498 _mm_prefetch!_MM_HINT_NTA(cacheline.ptr); 1499 } 1500 1501 deprecated("Use _mm_sad_pu8 instead") alias _m_psadbw = _mm_sad_pu8;/// 1502 deprecated("Use _mm_shuffle_pi16 instead") alias _m_pshufw = _mm_shuffle_pi16;/// 1503 1504 1505 /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a`` , 1506 /// and return the results. The maximum relative error for this approximation is less than 1.5*2^-12. 
1507 __m128 _mm_rcp_ps (__m128 a) pure @trusted 1508 { 1509 static if (GDC_with_SSE) 1510 { 1511 return __builtin_ia32_rcpps(a); 1512 } 1513 else static if (LDC_with_SSE1) 1514 { 1515 return __builtin_ia32_rcpps(a); 1516 } 1517 else 1518 { 1519 a.ptr[0] = 1.0f / a.array[0]; 1520 a.ptr[1] = 1.0f / a.array[1]; 1521 a.ptr[2] = 1.0f / a.array[2]; 1522 a.ptr[3] = 1.0f / a.array[3]; 1523 return a; 1524 } 1525 } 1526 unittest 1527 { 1528 __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f); 1529 __m128 groundTruth = _mm_set1_ps(1.0f) / A; 1530 __m128 result = _mm_rcp_ps(A); 1531 foreach(i; 0..4) 1532 { 1533 double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; 1534 assert(abs(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 1535 } 1536 } 1537 1538 /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in `a`, store it 1539 /// in the lower element of the result, and copy the upper 3 packed elements from `a` to the upper elements of result. 1540 /// The maximum relative error for this approximation is less than 1.5*2^-12. 1541 __m128 _mm_rcp_ss (__m128 a) pure @trusted 1542 { 1543 static if (GDC_with_SSE) 1544 { 1545 return __builtin_ia32_rcpss(a); 1546 } 1547 else static if (LDC_with_SSE1) 1548 { 1549 return __builtin_ia32_rcpss(a); 1550 } 1551 else 1552 { 1553 a.ptr[0] = 1.0f / a.array[0]; 1554 return a; 1555 } 1556 } 1557 unittest 1558 { 1559 __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f); 1560 __m128 correct = _mm_setr_ps(1 / 2.34f, -70000.0f, 0.00001f, 345.5f); 1561 __m128 R = _mm_rcp_ss(A); 1562 double relError = (cast(double)(correct.array[0]) / R.array[0]) - 1; 1563 assert(abs(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 1564 assert(R.array[1] == correct.array[1]); 1565 assert(R.array[2] == correct.array[2]); 1566 assert(R.array[3] == correct.array[3]); 1567 } 1568 1569 /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in `a`. 1570 /// The maximum relative error for this approximation is less than 1.5*2^-12. 1571 __m128 _mm_rsqrt_ps (__m128 a) pure @trusted 1572 { 1573 static if (GDC_with_SSE) 1574 { 1575 return __builtin_ia32_rsqrtps(a); 1576 } 1577 else static if (LDC_with_SSE1) 1578 { 1579 return __builtin_ia32_rsqrtps(a); 1580 } 1581 else version(LDC) 1582 { 1583 a[0] = 1.0f / llvm_sqrt(a[0]); 1584 a[1] = 1.0f / llvm_sqrt(a[1]); 1585 a[2] = 1.0f / llvm_sqrt(a[2]); 1586 a[3] = 1.0f / llvm_sqrt(a[3]); 1587 return a; 1588 } 1589 else 1590 { 1591 a.ptr[0] = 1.0f / sqrt(a.array[0]); 1592 a.ptr[1] = 1.0f / sqrt(a.array[1]); 1593 a.ptr[2] = 1.0f / sqrt(a.array[2]); 1594 a.ptr[3] = 1.0f / sqrt(a.array[3]); 1595 return a; 1596 } 1597 } 1598 unittest 1599 { 1600 __m128 A = _mm_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f); 1601 __m128 groundTruth = _mm_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f); 1602 __m128 result = _mm_rsqrt_ps(A); 1603 foreach(i; 0..4) 1604 { 1605 double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; 1606 assert(abs(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 1607 } 1608 } 1609 1610 /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in `a`, 1611 /// store the result in the lower element. Copy the upper 3 packed elements from `a` to the upper elements of result. 1612 /// The maximum relative error for this approximation is less than 1.5*2^-12. 
1613 __m128 _mm_rsqrt_ss (__m128 a) pure @trusted 1614 { 1615 static if (GDC_with_SSE) 1616 { 1617 return __builtin_ia32_rsqrtss(a); 1618 } 1619 else static if (LDC_with_SSE1) 1620 { 1621 return __builtin_ia32_rsqrtss(a); 1622 } 1623 else version(LDC) 1624 { 1625 a[0] = 1.0f / llvm_sqrt(a[0]); 1626 return a; 1627 } 1628 else 1629 { 1630 a[0] = 1.0f / sqrt(a[0]); 1631 return a; 1632 } 1633 } 1634 unittest // this one test 4 different intrinsics: _mm_rsqrt_ss, _mm_rsqrt_ps, _mm_rcp_ps, _mm_rcp_ss 1635 { 1636 double maxRelativeError = 0.000245; // -72 dB, stuff is apparently more precise than said in the doc? 1637 void testApproximateSSE(float number) nothrow @nogc 1638 { 1639 __m128 A = _mm_set1_ps(number); 1640 1641 // test _mm_rcp_ps 1642 __m128 B = _mm_rcp_ps(A); 1643 foreach(i; 0..4) 1644 { 1645 double exact = 1.0f / A.array[i]; 1646 double ratio = cast(double)(B.array[i]) / cast(double)(exact); 1647 assert(abs(ratio - 1) <= maxRelativeError); 1648 } 1649 1650 // test _mm_rcp_ss 1651 { 1652 B = _mm_rcp_ss(A); 1653 double exact = 1.0f / A.array[0]; 1654 double ratio = cast(double)(B.array[0]) / cast(double)(exact); 1655 assert(abs(ratio - 1) <= maxRelativeError); 1656 } 1657 1658 // test _mm_rsqrt_ps 1659 B = _mm_rsqrt_ps(A); 1660 foreach(i; 0..4) 1661 { 1662 double exact = 1.0f / sqrt(A.array[i]); 1663 double ratio = cast(double)(B.array[i]) / cast(double)(exact); 1664 assert(abs(ratio - 1) <= maxRelativeError); 1665 } 1666 1667 // test _mm_rsqrt_ss 1668 { 1669 B = _mm_rsqrt_ss(A); 1670 double exact = 1.0f / sqrt(A.array[0]); 1671 double ratio = cast(double)(B.array[0]) / cast(double)(exact); 1672 assert(abs(ratio - 1) <= maxRelativeError); 1673 } 1674 } 1675 1676 testApproximateSSE(0.00001f); 1677 testApproximateSSE(1.1f); 1678 testApproximateSSE(345.0f); 1679 testApproximateSSE(2.45674864151f); 1680 testApproximateSSE(700000.0f); 1681 testApproximateSSE(10000000.0f); 1682 testApproximateSSE(27841456468.0f); 1683 } 1684 1685 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 1686 /// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 1687 /// low 16 bits of result. 1688 __m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe 1689 { 1690 return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b))); 1691 } 1692 1693 /// Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer 1694 /// `_MM_MASK_xxxx`. The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, `_MM_MASK_DIV_ZERO`, 1695 /// `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`. 1696 void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe 1697 { 1698 // Note: unsupported on ARM 1699 _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx); 1700 } 1701 1702 /// Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer 1703 /// `_MM_EXCEPT_xxxx`. The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 1704 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`. 1705 void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe 1706 { 1707 // Note: unsupported on ARM 1708 _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx); 1709 } 1710 1711 /// Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer 1712 /// `_MM_FLUSH_xxxx`. 
/// Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer
/// `_MM_MASK_xxxx`. The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, `_MM_MASK_DIV_ZERO`,
/// `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe
{
    // Note: unsupported on ARM
    _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
}

/// Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer
/// `_MM_EXCEPT_xxxx`. The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`,
/// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe
{
    // Note: unsupported on ARM
    _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
}

/// Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer
/// `_MM_FLUSH_xxxx`. The flush-to-zero mode may be either `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
}

/// Set packed single-precision (32-bit) floating-point elements with the supplied values.
__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    // Note: despite appearances, generates sensible code,
    // inlines correctly and is constant folded
    float[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(float4)(result.ptr);
}
unittest
{
    __m128 A = _mm_set_ps(3, 2, 1, 546);
    float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f];
    assert(A.array == correct);
}

deprecated("Use _mm_set1_ps instead") alias _mm_set_ps1 = _mm_set1_ps; ///

/// Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer
/// `_MM_ROUND_xxxx`. The rounding mode may contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`,
/// `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe
{
    _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
}
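// Hedged usage sketch (not in the original): switch to round-toward-zero so that
// `_mm_cvtss_si32` (defined earlier in this module) truncates, then restore the previous mode.
// This assumes the active backend honours the MXCSR rounding mode for conversions.
unittest
{
    uint savedCSR = _mm_getcsr();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(_mm_cvtss_si32(_mm_set_ss(1.75f)) == 1); // truncated, not rounded to 2
    _mm_setcsr(savedCSR); // restore previous rounding mode and flags
}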
/// Copy single-precision (32-bit) floating-point element `a` to the lower element of result, and zero the upper 3 elements.
__m128 _mm_set_ss (float a) pure @trusted
{
    __m128 r = _mm_setzero_ps();
    r.ptr[0] = a;
    return r;
}
unittest
{
    float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
    __m128 A = _mm_set_ss(42.0f);
    assert(A.array == correct);
}

/// Broadcast single-precision (32-bit) floating-point value `a` to all elements.
__m128 _mm_set1_ps (float a) pure @trusted
{
    __m128 r = void;
    r.ptr[0] = a;
    r.ptr[1] = a;
    r.ptr[2] = a;
    r.ptr[3] = a;
    return r;
}
unittest
{
    float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f];
    __m128 A = _mm_set1_ps(42.0f);
    assert(A.array == correct);
}

/// Set the MXCSR control and status register with the value in unsigned 32-bit integer `controlWord`.
void _mm_setcsr(uint controlWord) @trusted
{
    static if (LDC_with_ARM)
    {
        // Convert from the SSE to the ARM control word. This is done _partially_:
        // only rounding-mode and flush-to-zero changes are supported.

        // "To alter some bits of a VFP system register without
        // affecting other bits, use a read-modify-write procedure"
        uint fpscr = arm_get_fpcr();

        // Bits 23 to 22 hold the rounding mode (not used by NEON arithmetic)
        fpscr = fpscr & ~_MM_ROUND_MASK_ARM;
        switch(controlWord & _MM_ROUND_MASK)
        {
            default:
            case _MM_ROUND_NEAREST:     fpscr |= _MM_ROUND_NEAREST_ARM;     break;
            case _MM_ROUND_DOWN:        fpscr |= _MM_ROUND_DOWN_ARM;        break;
            case _MM_ROUND_UP:          fpscr |= _MM_ROUND_UP_ARM;          break;
            case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break;
        }
        fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM;
        if (controlWord & _MM_FLUSH_ZERO_MASK)
            fpscr |= _MM_FLUSH_ZERO_MASK_ARM;
        arm_set_fpcr(fpscr);
    }
    else version(GNU)
    {
        static if (GDC_with_SSE)
        {
            __builtin_ia32_ldmxcsr(controlWord);
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "ldmxcsr %0;\n"
                  :
                  : "m" (controlWord)
                  : ;
            }
        }
        else
            static assert(false);
    }
    else version (InlineX86Asm)
    {
        asm pure nothrow @nogc @safe
        {
            ldmxcsr controlWord;
        }
    }
    else
        static assert(0, "Not yet supported");
}
unittest
{
    _mm_setcsr(_mm_getcsr());
}

/// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order.
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    float[4] result = [e3, e2, e1, e0];
    return loadUnaligned!(float4)(result.ptr);
}
unittest
{
    __m128 A = _mm_setr_ps(3, 2, 1, 546);
    float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f];
    assert(A.array == correct);
    assert(A.array[0] == 3.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 1.0f);
    assert(A.array[3] == 546.0f);
}

/// Return vector of type `__m128` with all elements set to zero.
__m128 _mm_setzero_ps() pure @trusted
{
    // Compiles to xorps without problems
    float[4] result = [0.0f, 0.0f, 0.0f, 0.0f];
    return loadUnaligned!(float4)(result.ptr);
}
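// A minimal check (not in the original) that _mm_setzero_ps yields four zero lanes.
unittest
{
    __m128 R = _mm_setzero_ps();
    float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(R.array == correct);
}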
/// Perform a serializing operation on all store-to-memory instructions that were issued prior
/// to this instruction. Guarantees that every store instruction that precedes, in program order,
/// is globally visible before any store instruction which follows the fence in program order.
void _mm_sfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE)
        {
            __builtin_ia32_sfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "sfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE1)
    {
        __builtin_ia32_sfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            sfence;
        }
    }
    else version(LDC)
    {
        llvm_memory_fence(); // PERF: this generates mfence instead of sfence
    }
    else
        static assert(false);
}
unittest
{
    _mm_sfence();
}

/// Shuffle 16-bit integers in `a` using the control in `imm8`.
/// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime.
__m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @safe
{
    return cast(__m64) shufflevector!(short4, ( (imm8 >> 0) & 3 ),
                                              ( (imm8 >> 2) & 3 ),
                                              ( (imm8 >> 4) & 3 ),
                                              ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a);
}
unittest
{
    __m64 A = _mm_setr_pi16(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A);
    short[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

/// Shuffle single-precision (32-bit) floating-point elements from `a` and `b` using the control in `imm`:
/// the two lower elements of result come from `a`, the two upper elements from `b`.
/// Warning: the immediate shuffle value `imm` is given at compile-time instead of runtime.
__m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe
{
    return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b);
}
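// Hedged usage sketch (not in the original): select lanes with _MM_SHUFFLE; the two lower
// result lanes come from `a`, the two upper lanes from `b`.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_shuffle_ps!(_MM_SHUFFLE(3, 0, 1, 2))(A, B);
    float[4] correct = [3.0f, 2.0f, 5.0f, 8.0f];
    assert(R.array == correct);
}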
/// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
__m128 _mm_sqrt_ps(__m128 a) @trusted
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_sqrtps(a);
    }
    else version(LDC)
    {
        // __builtin_ia32_sqrtps disappeared with LDC 1.11
        static if (__VERSION__ < 2081)
            return __builtin_ia32_sqrtps(a);
        else
        {
            a[0] = llvm_sqrt(a[0]);
            a[1] = llvm_sqrt(a[1]);
            a[2] = llvm_sqrt(a[2]);
            a[3] = llvm_sqrt(a[3]);
            return a;
        }
    }
    else
    {
        a.ptr[0] = sqrt(a.array[0]);
        a.ptr[1] = sqrt(a.array[1]);
        a.ptr[2] = sqrt(a.array[2]);
        a.ptr[3] = sqrt(a.array[3]);
        return a;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 2.0f);
    assert(A.array[3] == 2.0f);
}

/// Compute the square root of the lower single-precision (32-bit) floating-point element in `a`, store it in the lower
/// element, and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_sqrt_ss(__m128 a) @trusted
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_sqrtss(a);
    }
    else version(LDC)
    {
        a.ptr[0] = llvm_sqrt(a.array[0]);
        return a;
    }
    else
    {
        a.ptr[0] = sqrt(a.array[0]);
        return a;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 4.0f);
    assert(A.array[2] == 4.0f);
    assert(A.array[3] == 4.0f);
}

/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_ps (float* mem_addr, __m128 a) pure
{
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = a;
}

deprecated("Use _mm_store1_ps instead") alias _mm_store_ps1 = _mm_store1_ps; ///

/// Store the lower single-precision (32-bit) floating-point element from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    *mem_addr = a.array[0];
}
unittest
{
    float a;
    _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546));
    assert(a == 546);
}

/// Store the lower single-precision (32-bit) floating-point element from `a` into 4 contiguous elements in memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // TODO: shouldn't be trusted
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[0];
    r.ptr[2] = a.array[0];
    r.ptr[3] = a.array[0];
    *aligned = r;
}
unittest
{
    align(16) float[4] A;
    _mm_store1_ps(A.ptr, _mm_set_ss(42.0f));
    float[4] correct = [42.0f, 42, 42, 42];
    assert(A == correct);
}

/// Store the upper 2 single-precision (32-bit) floating-point elements from `a` into memory.
void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted
{
    long2 la = cast(long2)a;
    (*p).ptr[0] = la.array[1];
}
unittest
{
    __m64 R = _mm_setzero_si64();
    long2 A = [13, 25];
    _mm_storeh_pi(&R, cast(__m128)A);
    assert(R.array[0] == 25);
}

/// Store the lower 2 single-precision (32-bit) floating-point elements from `a` into memory.
void _mm_storel_pi(__m64* p, __m128 a) pure @trusted
{
    long2 la = cast(long2)a;
    (*p).ptr[0] = la.array[0];
}
unittest
{
    __m64 R = _mm_setzero_si64();
    long2 A = [13, 25];
    _mm_storel_pi(&R, cast(__m128)A);
    assert(R.array[0] == 13);
}

/// Store 4 single-precision (32-bit) floating-point elements from `a` into memory in reverse order.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // TODO: should not be trusted
{
    __m128* aligned = cast(__m128*)mem_addr;
    __m128 r;
    r.ptr[0] = a.array[3];
    r.ptr[1] = a.array[2];
    r.ptr[2] = a.array[1];
    r.ptr[3] = a.array[0];
    *aligned = r;
}
unittest
{
    align(16) float[4] A;
    _mm_storer_ps(A.ptr, _mm_setr_ps(1.0f, 2, 3, 4));
    float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f];
    assert(A == correct);
}

/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_ps(float* mem_addr, __m128 a) pure @trusted // TODO: should not be trusted
{
    storeUnaligned!(float4)(a, mem_addr);
}
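// Hedged usage sketch (not in the original): _mm_storeu_ps has no alignment requirement,
// so a plain float[4] on the stack is a valid destination.
unittest
{
    float[4] buf;
    _mm_storeu_ps(buf.ptr, _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f));
    float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
    assert(buf == correct);
}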
/// Store 64-bits of integer data from `a` into memory using a non-temporal memory hint.
void _mm_stream_pi (__m64* mem_addr, __m64 a)
{
    // BUG: see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a; // it's a regular move instead
}

/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory using
/// a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may
/// be generated.
void _mm_stream_ps (float* mem_addr, __m128 a)
{
    // BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal
    // needs some IR outside this function that would say:
    //
    //  !0 = !{ i32 1 }
    //
    // It's a LLVM IR metadata description.
    __m128* dest = cast(__m128*)mem_addr;
    *dest = a; // it's a regular move instead
}
unittest
{
    align(16) float[4] A;
    _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
    assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
}

/// Subtract packed single-precision (32-bit) floating-point elements in `b` from packed single-precision (32-bit)
/// floating-point elements in `a`.
__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    return a - b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ps(a, a);
    float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == correct);
}

/// Subtract the lower single-precision (32-bit) floating-point element in `b` from the lower single-precision (32-bit)
/// floating-point element in `a`, store the subtraction result in the lower element of result, and copy the upper 3
/// packed elements from `a` to the upper elements of result.
__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
        return __builtin_ia32_subss(a, b);
    else
    {
        a[0] -= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ss(a, a);
    float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}

/// Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in `row0`,
/// `row1`, `row2`, and `row3`, and store the transposed matrix in these vectors (`row0` now contains column 0, etc.).
void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    __m128 tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm_unpacklo_ps(row0, row1);
    tmp2 = _mm_unpacklo_ps(row2, row3);
    tmp1 = _mm_unpackhi_ps(row0, row1);
    tmp3 = _mm_unpackhi_ps(row2, row3);
    row0 = _mm_movelh_ps(tmp0, tmp2);
    row1 = _mm_movehl_ps(tmp2, tmp0);
    row2 = _mm_movelh_ps(tmp1, tmp3);
    row3 = _mm_movehl_ps(tmp3, tmp1);
}
unittest
{
    __m128 l0 = _mm_setr_ps(0, 1, 2, 3);
    __m128 l1 = _mm_setr_ps(4, 5, 6, 7);
    __m128 l2 = _mm_setr_ps(8, 9, 10, 11);
    __m128 l3 = _mm_setr_ps(12, 13, 14, 15);
    _MM_TRANSPOSE4_PS(l0, l1, l2, l3);
    float[4] r0 = [0.0f, 4, 8, 12];
    float[4] r1 = [1.0f, 5, 9, 13];
    float[4] r2 = [2.0f, 6, 10, 14];
    float[4] r3 = [3.0f, 7, 11, 15];
    assert(l0.array == r0);
    assert(l1.array == r1);
    assert(l2.array == r2);
    assert(l3.array == r3);
}

// Note: the only difference between the `comi` and `ucomi` comparisons is their signalling
// behaviour on quiet NaNs. Aliasing them is therefore incorrect, but the case where you would
// want to distinguish qNaN from sNaN and treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_ss = _mm_comieq_ss;
alias _mm_ucomige_ss = _mm_comige_ss;
alias _mm_ucomigt_ss = _mm_comigt_ss;
alias _mm_ucomile_ss = _mm_comile_ss;
alias _mm_ucomilt_ss = _mm_comilt_ss;
alias _mm_ucomineq_ss = _mm_comineq_ss;
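// Hedged usage sketch (not in the original): the aliased ucomi comparisons behave like their
// comi counterparts defined earlier in this module, returning 1 when the comparison holds.
unittest
{
    assert(_mm_ucomieq_ss(_mm_set_ss(3.0f), _mm_set_ss(3.0f)) == 1);
    assert(_mm_ucomilt_ss(_mm_set_ss(1.0f), _mm_set_ss(2.0f)) == 1);
    assert(_mm_ucomigt_ss(_mm_set_ss(1.0f), _mm_set_ss(2.0f)) == 0);
}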
/// Return vector of type `__m128` with undefined elements.
__m128 _mm_undefined_ps() pure @safe
{
    __m128 undef = void;
    return undef;
}

/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of `a` and `b`.
__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted
{
    version(LDC)
    {
        // x86: plain version generates unpckhps with LDC 1.0.0 at -O1, but shufflevector is 8 fewer instructions at -O0
        return shufflevectorLDC!(__m128, 2, 6, 3, 7)(a, b);
    }
    else
    {
        __m128 r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = b.array[2];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_unpackhi_ps(A, B);
    float[4] correct = [3.0f, 7.0f, 4.0f, 8.0f];
    assert(R.array == correct);
}

/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of `a` and `b`.
__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted
{
    version(LDC)
    {
        // x86: plain version generates unpcklps with LDC 1.0.0 at -O1, but shufflevector is 8 fewer instructions at -O0
        return shufflevectorLDC!(__m128, 0, 4, 1, 5)(a, b);
    }
    else
    {
        __m128 r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_unpacklo_ps(A, B);
    float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f];
    assert(R.array == correct);
}

/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
}
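// A minimal check (not in the original): XORing a vector with itself clears every bit,
// which is a common idiom for zeroing a register.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 R = _mm_xor_ps(A, A);
    float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(R.array == correct);
}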
private
{
    // Returns: `true` if the pointer is suitably aligned.
    bool isPointerAligned(void* p, size_t alignment) pure
    {
        assert(alignment != 0);
        return ( cast(size_t)p & (alignment - 1) ) == 0;
    }

    // Returns: next pointer aligned with alignment bytes.
    void* nextAlignedPointer(void* start, size_t alignment) pure
    {
        return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
    }

    // Returns number of bytes to actually allocate when asking
    // for a particular alignment
    @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        return askedSize + alignment - 1 + pointerSize * 3;
    }

    // Store the pointer given by malloc, the requested size and the alignment
    @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        char* start = cast(char*)raw + pointerSize * 3;
        void* aligned = nextAlignedPointer(start, alignment);
        void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
        *rawLocation = raw;
        size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
        *sizeLocation = size;
        size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
        *alignmentLocation = alignment;
        assert( isPointerAligned(aligned, alignment) );
        return aligned;
    }

    // Returns: x, multiple of powerOfTwo, so that x >= n.
    @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
    {
        // check power-of-two
        assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));

        size_t mask = ~(powerOfTwo - 1);
        return (n + powerOfTwo - 1) & mask;
    }
}

unittest
{
    assert(nextMultipleOf(0, 4) == 0);
    assert(nextMultipleOf(1, 4) == 4);
    assert(nextMultipleOf(2, 4) == 4);
    assert(nextMultipleOf(3, 4) == 4);
    assert(nextMultipleOf(4, 4) == 4);
    assert(nextMultipleOf(5, 4) == 8);

    {
        void* p = _mm_malloc(23, 16);
        assert(p !is null);
        assert(((cast(size_t)p) & 0xf) == 0);
        _mm_free(p);
    }

    void* nullAlloc = _mm_malloc(0, 32);
    assert(nullAlloc != null);
    _mm_free(nullAlloc);
}

// For some reason, order of declaration is important for this one,
// so it is misplaced.
// Note: this is just another name for _mm_cvtss_si32.
alias _mm_cvt_ss2si = _mm_cvtss_si32;
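// Hedged usage sketch (not in the original): under any rounding mode, an exactly
// representable value converts without change through the _mm_cvt_ss2si alias.
unittest
{
    assert(_mm_cvt_ss2si(_mm_set_ss(-4.0f)) == -4);
}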