/**
 * SSE intrinsics.
 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE
 *
 * Copyright: Copyright Guillaume Piolat 2016-2020.
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module inteli.xmmintrin;

public import inteli.types;

import inteli.internals;

import inteli.mmx;
import inteli.emmintrin;

// InlineX86Asm is set whenever DMD-style inline assembly is available,
// in either its 32-bit or 64-bit flavor.
version(D_InlineAsm_X86)
    version = InlineX86Asm;
else version(D_InlineAsm_X86_64)
    version = InlineX86Asm;


// SSE1

nothrow @nogc:


// MXCSR bit layout constants, matching <xmmintrin.h>.
enum int _MM_EXCEPT_INVALID    = 0x0001; /// MXCSR Exception states.
enum int _MM_EXCEPT_DENORM     = 0x0002; ///ditto
enum int _MM_EXCEPT_DIV_ZERO   = 0x0004; ///ditto
enum int _MM_EXCEPT_OVERFLOW   = 0x0008; ///ditto
enum int _MM_EXCEPT_UNDERFLOW  = 0x0010; ///ditto
enum int _MM_EXCEPT_INEXACT    = 0x0020; ///ditto
enum int _MM_EXCEPT_MASK       = 0x003f; /// MXCSR Exception states mask.

enum int _MM_MASK_INVALID      = 0x0080; /// MXCSR Exception masks.
enum int _MM_MASK_DENORM       = 0x0100; ///ditto
enum int _MM_MASK_DIV_ZERO     = 0x0200; ///ditto
enum int _MM_MASK_OVERFLOW     = 0x0400; ///ditto
enum int _MM_MASK_UNDERFLOW    = 0x0800; ///ditto
enum int _MM_MASK_INEXACT      = 0x1000; ///ditto
enum int _MM_MASK_MASK         = 0x1f80; /// MXCSR Exception masks mask.

enum int _MM_ROUND_NEAREST     = 0x0000; /// MXCSR Rounding mode.
enum int _MM_ROUND_DOWN        = 0x2000; ///ditto
enum int _MM_ROUND_UP          = 0x4000; ///ditto
enum int _MM_ROUND_TOWARD_ZERO = 0x6000; ///ditto
enum int _MM_ROUND_MASK        = 0x6000; /// MXCSR Rounding mode mask.

enum int _MM_FLUSH_ZERO_MASK   = 0x8000; /// MXCSR Denormal flush to zero mask.
enum int _MM_FLUSH_ZERO_ON     = 0x8000; /// MXCSR Denormal flush to zero modes.
enum int _MM_FLUSH_ZERO_OFF    = 0x0000; ///ditto

/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m128 _mm_add_ps(__m128 a, __m128 b) pure @safe
{
    pragma(inline, true);
    // Native vector addition maps directly to ADDPS on x86 targets.
    return a + b;
}
unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ps(a, a);
    assert(a.array[0] == 2);
    assert(a.array[1] == 4);
    assert(a.array[2] == 6);
    assert(a.array[3] == 8);
}

/// Add the lower single-precision (32-bit) floating-point element
/// in `a` and `b`, store the result in the lower element of result,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_add_ss(__m128 a, __m128 b) pure @safe
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_addss(a, b);
    }
    else static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.ADDSS, a, b);
    }
    else
    {
        // Generic fallback: scalar add in lane 0; upper lanes pass through from `a`.
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1, 2, 3, 4];
    a = _mm_add_ss(a, a);
    assert(a.array == [2.0f, 2, 3, 4]);
}

/// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m128 _mm_and_ps (__m128 a, __m128 b) pure @safe
{
    pragma(inline, true);
    // Bitwise operators are not defined on float vectors, so reinterpret as integer lanes.
    return cast(__m128)(cast(__m128i)a & cast(__m128i)b);
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int correct = (*cast(int*)(&a)) & (*cast(int*)(&b));
    __m128 A = _mm_set_ps(a, b, a, b);
    __m128 B = _mm_set_ps(b, a, b, a);
    int4 R = cast(int4)( _mm_and_ps(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
    assert(R.array[2] == correct);
    assert(R.array[3] == correct);
}

/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`.
__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.ANDNPS, a, b);
    else
        // Like ANDNPS: the FIRST operand is complemented, i.e. (~a) & b.
        return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int correct = ~(*cast(int*)(&a)) & (*cast(int*)(&b));
    int correct2 = (*cast(int*)(&a)) & ~(*cast(int*)(&b));
    __m128 A = _mm_set_ps(a, b, a, b);
    __m128 B = _mm_set_ps(b, a, b, a);
    int4 R = cast(int4)( _mm_andnot_ps(A, B) );
    assert(R.array[0] == correct2);
    assert(R.array[1] == correct);
    assert(R.array[2] == correct2);
    assert(R.array[3] == correct);
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
{
    // Widen to 128-bit, reuse the SSE2 implementation, truncate back to 64-bit.
    return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
{
    // Widen to 128-bit, reuse the SSE2 implementation, truncate back to 64-bit.
    return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for equality.
__m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPPS predicate 0 = EQ (ordered); NaN lanes compare false.
        return cast(__m128) __simd(XMM.CMPPS, a, b, 0);
    else
        return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, float.nan, float.nan);
    __m128i R = cast(__m128i) _mm_cmpeq_ps(A, B);
    int[4] correct = [0, -1, 0, 0];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for equality,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPSS predicate 0 = EQ (ordered); only lane 0 is compared.
        return cast(__m128) __simd(XMM.CMPSS, a, b, 0);
    else
        return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpeq_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpeq_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpeq_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpeq_ss(A, E);
    int[4] correct1 = [-1, 0, 0, 0];
    int[4] correct2 = [0, 0, 0, 0];
    int[4] correct3 = [0, 0, 0, 0];
    int[4] correct4 = [0, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal.
__m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPPS has no GE predicate; compute (b <= a) with predicate 2 (LE) and swapped operands.
        return cast(__m128) __simd(XMM.CMPPS, b, a, 2);
    else
        return cast(__m128) cmpps!(FPComparison.oge)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpge_ps(A, B);
    int[4] correct = [0, -1, -1, 0];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // No GE predicate in CMPSS: compute (b <= a) with swapped operands.
        // Swapping makes the upper lanes come from `b`, so splice the scalar
        // result back into `a` to keep a's upper 3 lanes as documented.
        __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 2);
        a[0] = c[0];
        return a;
    }
    else
        return cast(__m128) cmpss!(FPComparison.oge)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpge_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpge_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpge_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpge_ss(A, E);
    int[4] correct1 = [-1, 0, 0, 0];
    int[4] correct2 = [-1, 0, 0, 0];
    int[4] correct3 = [0, 0, 0, 0];
    int[4] correct4 = [0, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than.
__m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // No GT predicate in CMPPS; compute (b < a) with predicate 1 (LT) and swapped operands.
        return cast(__m128) __simd(XMM.CMPPS, b, a, 1);
    else
        return cast(__m128) cmpps!(FPComparison.ogt)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpgt_ps(A, B);
    int[4] correct = [0, 0, -1, 0];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // No GT predicate in CMPSS: compute (b < a) with swapped operands,
        // then splice the scalar result into `a` so upper lanes stay a's.
        __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 1);
        a[0] = c[0];
        return a;
    }
    else
        return cast(__m128) cmpss!(FPComparison.ogt)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpgt_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpgt_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpgt_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpgt_ss(A, E);
    int[4] correct1 = [0, 0, 0, 0];
    int[4] correct2 = [-1, 0, 0, 0];
    int[4] correct3 = [0, 0, 0, 0];
    int[4] correct4 = [0, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal.
__m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPPS predicate 2 = LE (ordered).
        return cast(__m128) __simd(XMM.CMPPS, a, b, 2);
    else
        return cast(__m128) cmpps!(FPComparison.ole)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmple_ps(A, B);
    int[4] correct = [-1, -1, 0, 0];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPSS predicate 2 = LE (ordered); upper lanes come from `a` natively.
        return cast(__m128) __simd(XMM.CMPSS, a, b, 2);
    else
        return cast(__m128) cmpss!(FPComparison.ole)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmple_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmple_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmple_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmple_ss(A, E);
    int[4] correct1 = [-1, 0, 0, 0];
    int[4] correct2 = [0, 0, 0, 0];
    int[4] correct3 = [0, 0, 0, 0];
    int[4] correct4 = [-1, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than.
__m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPPS predicate 1 = LT (ordered).
        return cast(__m128) __simd(XMM.CMPPS, a, b, 1);
    else
        return cast(__m128) cmpps!(FPComparison.olt)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmplt_ps(A, B);
    int[4] correct = [-1, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPSS predicate 1 = LT (ordered).
        return cast(__m128) __simd(XMM.CMPSS, a, b, 1);
    else
        return cast(__m128) cmpss!(FPComparison.olt)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmplt_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmplt_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmplt_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmplt_ss(A, E);
    int[4] correct1 = [0, 0, 0, 0];
    int[4] correct2 = [0, 0, 0, 0];
    int[4] correct3 = [0, 0, 0, 0];
    int[4] correct4 = [-1, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-equal.
__m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPPS predicate 4 = NEQ (unordered); NaN lanes compare true.
        return cast(__m128) __simd(XMM.CMPPS, a, b, 4);
    else
        return cast(__m128) cmpps!(FPComparison.une)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpneq_ps(A, B);
    int[4] correct = [-1, 0, -1, -1];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-equal,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPSS predicate 4 = NEQ (unordered); NaN in either operand compares true.
        return cast(__m128) __simd(XMM.CMPSS, a, b, 4);
    else
        return cast(__m128) cmpss!(FPComparison.une)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpneq_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpneq_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpneq_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpneq_ss(A, E);
    int[4] correct1 = [0, 0, 0, 0];
    int[4] correct2 = [-1, 0, 0, 0];
    int[4] correct3 = [-1, 0, 0, 0];
    int[4] correct4 = [-1, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal.
__m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // not(a >= b) == not(b <= a): predicate 6 (NLE) with swapped operands.
        return cast(__m128) __simd(XMM.CMPPS, b, a, 6);
    else
        return cast(__m128) cmpps!(FPComparison.ult)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpnge_ps(A, B);
    int[4] correct = [-1, 0, 0, -1];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // not(a >= b) computed as NLE (predicate 6) with swapped operands;
        // splice the scalar result into `a` so the upper 3 lanes stay a's.
        __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 6);
        a[0] = c[0];
        return a;
    }
    else
        return cast(__m128) cmpss!(FPComparison.ult)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpnge_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpnge_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpnge_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpnge_ss(A, E);
    int[4] correct1 = [0, 0, 0, 0];
    int[4] correct2 = [0, 0, 0, 0];
    int[4] correct3 = [-1, 0, 0, 0];
    int[4] correct4 = [-1, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than.
__m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // not(a > b) == not(b < a): predicate 5 (NLT) with swapped operands.
        return cast(__m128) __simd(XMM.CMPPS, b, a, 5);
    else
        return cast(__m128) cmpps!(FPComparison.ule)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpngt_ps(A, B);
    int[4] correct = [-1, -1, 0, -1];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        // not(a > b) computed as NLT (predicate 5) with swapped operands;
        // splice the scalar result into `a` so the upper 3 lanes stay a's.
        __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 5);
        a[0] = c[0];
        return a;
    }
    else
        return cast(__m128) cmpss!(FPComparison.ule)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpngt_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpngt_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpngt_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpngt_ss(A, E);
    int[4] correct1 = [-1, 0, 0, 0];
    int[4] correct2 = [0, 0, 0, 0];
    int[4] correct3 = [-1, 0, 0, 0];
    int[4] correct4 = [-1, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal.
__m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPPS predicate 6 = NLE (unordered); NaN lanes compare true.
        return cast(__m128) __simd(XMM.CMPPS, a, b, 6);
    else
        return cast(__m128) cmpps!(FPComparison.ugt)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpnle_ps(A, B);
    int[4] correct = [0, 0, -1, -1];
    assert(R.array == correct);
}


/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPSS predicate 6 = NLE (unordered).
        return cast(__m128) __simd(XMM.CMPSS, a, b, 6);
    else
        return cast(__m128) cmpss!(FPComparison.ugt)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpnle_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpnle_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpnle_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpnle_ss(A, E);
    int[4] correct1 = [0, 0, 0, 0];
    int[4] correct2 = [-1, 0, 0, 0];
    int[4] correct3 = [-1, 0, 0, 0];
    int[4] correct4 = [0, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than.
__m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPPS predicate 5 = NLT (unordered); NaN lanes compare true.
        return cast(__m128) __simd(XMM.CMPPS, a, b, 5);
    else
        return cast(__m128) cmpps!(FPComparison.uge)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpnlt_ps(A, B);
    int[4] correct = [0, -1, -1, -1];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPSS predicate 5 = NLT (unordered).
        return cast(__m128) __simd(XMM.CMPSS, a, b, 5);
    else
        return cast(__m128) cmpss!(FPComparison.uge)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpnlt_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpnlt_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpnlt_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpnlt_ss(A, E);
    int[4] correct1 = [-1, 0, 0, 0];
    int[4] correct2 = [-1, 0, 0, 0];
    int[4] correct3 = [-1, 0, 0, 0];
    int[4] correct4 = [0, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN.
__m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPPS predicate 7 = ORD: true where both lanes are non-NaN.
        return cast(__m128) __simd(XMM.CMPPS, a, b, 7);
    else
        return cast(__m128) cmpps!(FPComparison.ord)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpord_ps(A, B);
    int[4] correct = [-1, -1, -1, 0];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPSS predicate 7 = ORD: true when neither lane 0 is NaN.
        return cast(__m128) __simd(XMM.CMPSS, a, b, 7);
    else
        return cast(__m128) cmpss!(FPComparison.ord)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpord_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpord_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpord_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpord_ss(A, E);
    int[4] correct1 = [-1, 0, 0, 0];
    int[4] correct2 = [-1, 0, 0, 0];
    int[4] correct3 = [0, 0, 0, 0];
    int[4] correct4 = [-1, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN.
__m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPPS predicate 3 = UNORD: true where either lane is NaN.
        return cast(__m128) __simd(XMM.CMPPS, a, b, 3);
    else
        return cast(__m128) cmpps!(FPComparison.uno)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan);
    __m128i R = cast(__m128i) _mm_cmpunord_ps(A, B);
    int[4] correct = [0, 0, 0, -1];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        // CMPSS predicate 3 = UNORD: true when either lane 0 is NaN.
        return cast(__m128) __simd(XMM.CMPSS, a, b, 3);
    else
        return cast(__m128) cmpss!(FPComparison.uno)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpunord_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpunord_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpunord_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpunord_ss(A, E);
    int[4] correct1 = [0, 0, 0, 0];
    int[4] correct2 = [0, 0, 0, 0];
    int[4] correct3 = [-1, 0, 0, 0];
    int[4] correct4 = [0, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}


/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for equality,
/// and return the boolean result (0 or 1).
int _mm_comieq_ss (__m128 a, __m128 b) pure @safe
{
    // Ordered scalar compare: NaN in either operand yields 0.
    return a.array[0] == b.array[0];
}
unittest
{
    assert(1 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
    assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
    assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
    assert(0 == _mm_comieq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
    assert(1 == _mm_comieq_ss(_mm_set_ss(0.0), _mm_set_ss(-0.0))); // +0.0 == -0.0 per IEEE 754
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than-or-equal,
/// and return the boolean result (0 or 1).
int _mm_comige_ss (__m128 a, __m128 b) pure @safe
{
    // Ordered scalar compare: a NaN in either operand makes `>=` false.
    bool result = a.array[0] >= b.array[0];
    return result ? 1 : 0;
}
unittest
{
    __m128 big = _mm_set_ss(78.0f);
    __m128 neg = _mm_set_ss(-78.0f);
    __m128 nan = _mm_set_ss(float.nan);
    assert(1 == _mm_comige_ss(big, big));
    assert(1 == _mm_comige_ss(big, neg));
    assert(0 == _mm_comige_ss(neg, big));
    assert(0 == _mm_comige_ss(big, nan));
    assert(0 == _mm_comige_ss(nan, _mm_set_ss(-4.22f)));
    assert(1 == _mm_comige_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f))); // signed zeroes compare equal
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than,
/// and return the boolean result (0 or 1).
int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta
{
    // Ordered scalar compare: a NaN in either operand makes `>` false.
    bool result = a.array[0] > b.array[0];
    return result ? 1 : 0;
}
unittest
{
    __m128 big = _mm_set_ss(78.0f);
    __m128 neg = _mm_set_ss(-78.0f);
    __m128 nan = _mm_set_ss(float.nan);
    assert(0 == _mm_comigt_ss(big, big));
    assert(1 == _mm_comigt_ss(big, neg));
    assert(0 == _mm_comigt_ss(big, nan));
    assert(0 == _mm_comigt_ss(nan, _mm_set_ss(-4.22f)));
    assert(0 == _mm_comigt_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); // signed zeroes compare equal
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than-or-equal,
/// and return the boolean result (0 or 1).
int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe
{
    // Ordered scalar compare: a NaN in either operand makes `<=` false.
    bool result = a.array[0] <= b.array[0];
    return result ? 1 : 0;
}
unittest
{
    __m128 big = _mm_set_ss(78.0f);
    __m128 neg = _mm_set_ss(-78.0f);
    __m128 nan = _mm_set_ss(float.nan);
    assert(1 == _mm_comile_ss(big, big));
    assert(0 == _mm_comile_ss(big, neg));
    assert(1 == _mm_comile_ss(neg, big));
    assert(0 == _mm_comile_ss(big, nan));
    assert(0 == _mm_comile_ss(nan, _mm_set_ss(-4.22f)));
    assert(1 == _mm_comile_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); // signed zeroes compare equal
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than,
/// and return the boolean result (0 or 1).
int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb
{
    // Ordered scalar compare: a NaN in either operand makes `<` false.
    bool result = a.array[0] < b.array[0];
    return result ? 1 : 0;
}
unittest
{
    __m128 big = _mm_set_ss(78.0f);
    __m128 neg = _mm_set_ss(-78.0f);
    __m128 nan = _mm_set_ss(float.nan);
    assert(0 == _mm_comilt_ss(big, big));
    assert(0 == _mm_comilt_ss(big, neg));
    assert(1 == _mm_comilt_ss(neg, big));
    assert(0 == _mm_comilt_ss(big, nan));
    assert(0 == _mm_comilt_ss(nan, _mm_set_ss(-4.22f)));
    assert(0 == _mm_comilt_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f))); // signed zeroes compare equal
}

/// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for not-equal,
/// and return the boolean result (0 or 1).
int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne
{
    // Unordered-style inequality: NaN in either operand yields 1.
    return a.array[0] != b.array[0];
}
unittest
{
    assert(0 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f)));
    assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f)));
    assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan)));
    assert(1 == _mm_comineq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f)));
    assert(0 == _mm_comineq_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); // signed zeroes compare equal
}

/// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit)
/// floating-point elements, store the results in the lower 2 elements,
/// and copy the upper 2 packed elements from `a` to the upper elements of result.
alias _mm_cvt_pi2ps = _mm_cvtpi32_ps;

/// Convert 2 lower packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
__m64 _mm_cvt_ps2pi (__m128 a) @safe
{
    // NOTE(review): not `pure`, unlike neighbouring conversions — presumably because
    // _mm_cvtps_epi32 rounding depends on runtime MXCSR state; confirm.
    return to_m64(_mm_cvtps_epi32(a));
}

/// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element,
/// store the result in the lower element, and copy the upper 3 packed elements from `a` to the
/// upper elements of the result.
__m128 _mm_cvt_si2ss (__m128 v, int x) pure @trusted
{
    v.ptr[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42f, 0, 0, 0]);
}

/// Convert packed 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpi16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit (shift up then arithmetic shift down)
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpi16_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit)
/// floating-point elements, store the results in the lower 2 elements,
/// and copy the upper 2 packed elements from `a` to the upper elements of result.
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted
{
    // Convert both lanes of b, then overwrite only the lower two lanes of a.
    __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
    a.ptr[0] = fb.array[0];
    a.ptr[1] = fb.array[1];
    return a;
}
unittest
{
    __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
    float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements,
/// store the results in the lower 2 elements, then convert the packed signed 32-bit integers in `b` to
/// single-precision (32-bit) floating-point element, and store the results in the upper 2 elements.
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted
{
    // Each __m64 holds two packed 32-bit ints in one long lane; concatenate
    // them into a long2, then reinterpret as four 32-bit ints for conversion.
    long2 l;
    l.ptr[0] = a.array[0];
    l.ptr[1] = b.array[0];
    return _mm_cvtepi32_ps(cast(__m128i)l);
}
unittest
{
    __m64 A = _mm_setr_pi32(-45, 128);
    __m64 B = _mm_setr_pi32(0, 1000);
    __m128 R = _mm_cvtpi32x2_ps(A, B);
    float[4] correct = [-45.0f, 128.0f, 0.0f, 1000.0f];
    assert(R.array == correct);
}

/// Convert the lower packed 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpi8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());

    // Replicate sign bit
    b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24); // Replicate sign bit (shift left then arithmetic right = sign extension)
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpi8_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 16-bit integers.
/// Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF.
__m64 _mm_cvtps_pi16 (__m128 a) @safe
{
    // The C++ version of this intrinsic convert to 32-bit float, then use packssdw
    // Which means the 16-bit integers should be saturated
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, b);
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
    short4 R = cast(short4) _mm_cvtps_pi16(A);
    short[4] correct = [-1, 2, -32768, 32767];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers.
__m64 _mm_cvtps_pi32 (__m128 a) @safe
{
    // Not pure: conversion honours the current MXCSR rounding mode.
    return to_m64(_mm_cvtps_epi32(a));
}
unittest
{
    __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f, );
    int2 R = cast(int2) _mm_cvtps_pi32(A);
    int[2] correct = [-33000, 70000];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 8-bit integers,
/// and store the results in lower 4 elements.
/// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF.
__m64 _mm_cvtps_pi8 (__m128 a) @safe
{
    // The C++ version of this intrinsic convert to 32-bit float, then use packssdw + packsswb
    // Which means the 8-bit integers should be saturated
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, _mm_setzero_si128());
    b = _mm_packs_epi16(b, _mm_setzero_si128());
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f);
    byte8 R = cast(byte8) _mm_cvtps_pi8(A);
    byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Convert packed unsigned 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpu16_ps (__m64 a) pure @safe
{
    // Unlike _mm_cvtpi16_ps, no sign replication: input is unsigned.
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpu16_ps(A);
    float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert the lower packed unsigned 8-bit integers in `a` to packed single-precision (32-bit) floating-point element.
__m128 _mm_cvtpu8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpu8_ps(A);
    float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element,
/// store the result in the lower element, and copy the upper 3 packed elements from `a` to the
/// upper elements of result.
__m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted
{
    v.ptr[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}


/// Convert the signed 64-bit integer `b` to a single-precision (32-bit) floating-point element,
/// store the result in the lower element, and copy the upper 3 packed elements from `a` to the
/// upper elements of result.
__m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted
{
    v.ptr[0] = cast(float)x;
    return v;
}
unittest
{
    __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42);
    assert(a.array == [42.0f, 0, 0, 0]);
}

/// Take the lower single-precision (32-bit) floating-point element of `a`.
float _mm_cvtss_f32(__m128 a) pure @safe
{
    return a.array[0];
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer.
int _mm_cvtss_si32 (__m128 a) @safe // PERF GDC
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_cvtss2si(a);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_cvtss2si(a);
    }
    else static if (DMD_with_DSIMD)
    {
        // FIX: removed an unused local `__m128 b;` that was dead code here.
        __m128i r = cast(__m128i) __simd(XMM.CVTPS2DQ, a); // Note: converts 4 integers.
        return r.array[0];
    }
    else
    {
        // Fallback honours the current MXCSR rounding mode, like cvtss2si does.
        return convertFloatToInt32UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtss_si64 (__m128 a) @safe
{
    static if (LDC_with_SSE2)
    {
        version(X86_64)
        {
            return __builtin_ia32_cvtss2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertFloatToInt64UsingMXCSR(a.array[0]);
        }
    }
    else
    {
        return convertFloatToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}


/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit
/// integer with truncation.
int _mm_cvtt_ss2si (__m128 a) pure @safe
{
    // x86: cvttss2si always generated, even in -O0
    return cast(int)(a.array[0]);
}
alias _mm_cvttss_si32 = _mm_cvtt_ss2si; ///ditto
unittest
{
    assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}


/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit
/// integers with truncation.
__m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
{
    return to_m64(_mm_cvttps_epi32(a));
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit
/// integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]);
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
__m128 _mm_div_ps(__m128 a, __m128 b) pure @safe
{
    pragma(inline, true);
    return a / b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ps(a, a);
    float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f];
    assert(a.array == correct);
}

/// Divide the lower single-precision (32-bit) floating-point element in `a` by the lower
/// single-precision (32-bit) floating-point element in `b`, store the result in the lower
/// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_div_ss(__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.DIVSS, a, b);
    else static if (GDC_with_SSE)
        return __builtin_ia32_divss(a, b);
    else
    {
        a[0] /= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_div_ss(a, a);
    float[4] correct = [1.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}

/// Extract a 16-bit unsigned integer from `a`, selected with `imm8`. Zero-extended.
int _mm_extract_pi16 (__m64 a, int imm8)
{
    // imm8 must be in 0..3; unlike _mm_insert_pi16, the index is not masked here.
    short4 sa = cast(short4)a;
    return cast(ushort)(sa.array[imm8]);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 6, 0, 4);
    assert(_mm_extract_pi16(A, 0) == 65535);
    assert(_mm_extract_pi16(A, 1) == 6);
    assert(_mm_extract_pi16(A, 2) == 0);
    assert(_mm_extract_pi16(A, 3) == 4);
}

/// Free aligned memory that was allocated with `_mm_malloc` or `_mm_realloc`.
void _mm_free(void * mem_addr) @trusted
{
    // support for free(NULL)
    if (mem_addr is null)
        return;

    // Technically we don't need to store size and alignment in the chunk, but we do in case we
    // have to implement _mm_realloc

    // NOTE(review): this layout (raw pointer at mem_addr - sizeof(size_t), alignment at
    // mem_addr - 3 * pointer size) must mirror what storeRawPointerPlusInfo wrote in
    // _mm_malloc — confirm against that helper if the header layout ever changes.
    size_t pointerSize = (void*).sizeof;
    void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof);
    size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize);
    size_t alignment = *alignmentLocation;
    assert(alignment != 0);
    assert(isPointerAligned(mem_addr, alignment));
    free(*rawLocation);
}

/// Get the exception mask bits from the MXCSR control and status register.
/// The exception mask may contain any of the following flags: `_MM_MASK_INVALID`,
/// `_MM_MASK_DIV_ZERO`, `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
/// Note: won't correspond to reality on non-x86, where MXCSR this is emulated.
uint _MM_GET_EXCEPTION_MASK() @safe
{
    return _mm_getcsr() & _MM_MASK_MASK;
}

/// Get the exception state bits from the MXCSR control and status register.
/// The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`,
/// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
/// Note: won't correspond to reality on non-x86, where MXCSR this is emulated. No exception reported.
uint _MM_GET_EXCEPTION_STATE() @safe
{
    return _mm_getcsr() & _MM_EXCEPT_MASK;
}

/// Get the flush zero bits from the MXCSR control and status register.
/// The flush zero may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`
uint _MM_GET_FLUSH_ZERO_MODE() @safe
{
    return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/// Get the rounding mode bits from the MXCSR control and status register. The rounding mode may
/// contain any of the following flags: `_MM_ROUND_NEAREST, `_MM_ROUND_DOWN`, `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
uint _MM_GET_ROUNDING_MODE() @safe
{
    return _mm_getcsr() & _MM_ROUND_MASK;
}

/// Get the unsigned 32-bit value of the MXCSR control and status register.
/// Note: this is emulated on ARM, because there is no MXCSR register then.
uint _mm_getcsr() @trusted
{
    static if (LDC_with_ARM)
    {
        // Note: we convert the ARM FPSCR into a x86 SSE control word.
        // However, only rounding mode and flush to zero are actually set.
        // The returned control word will have all exceptions masked, and no exception detected.

        uint fpscr = arm_get_fpcr();

        uint cw = 0; // No exception detected
        if (fpscr & _MM_FLUSH_ZERO_MASK_ARM)
        {
            // ARM has a single flush-to-zero flag, which maps to
            // both the x86 FTZ and DAZ bits.
            // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode
            cw |= _MM_FLUSH_ZERO_ON;
            cw |= 0x40; // set "denormals are zeros"
        }
        cw |= _MM_MASK_MASK; // All exceptions masked

        // Rounding mode
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     cw |= _MM_ROUND_NEAREST;     break;
            case _MM_ROUND_DOWN_ARM:        cw |= _MM_ROUND_DOWN;        break;
            case _MM_ROUND_UP_ARM:          cw |= _MM_ROUND_UP;          break;
            case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO; break;
        }
        return cw;
    }
    else version(GNU)
    {
        static if (GDC_with_SSE)
        {
            return __builtin_ia32_stmxcsr();
        }
        else version(X86)
        {
            // GDC without the SSE builtins: read MXCSR with extended inline asm.
            uint sseRounding = 0;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %0;\n"
                  : "=m" (sseRounding)
                  :
                  : ;
            }
            return sseRounding;
        }
        else return __warn_noop_ret!uint();
    }
    else version (InlineX86Asm)
    {
        uint controlWord;
        asm nothrow @nogc pure @trusted
        {
            stmxcsr controlWord;
        }
        return controlWord;
    }
    else
        static assert(0, "Not yet supported");
}
unittest
{
    uint csr = _mm_getcsr();
}

/// Insert a 16-bit integer `i` inside `a` at the location specified by `imm8`.
__m64 _mm_insert_pi16 (__m64 v, int i, int imm8) pure @trusted
{
    // Index is masked to 0..3, matching the hardware pinsrw behaviour.
    short4 r = cast(short4)v;
    r.ptr[imm8 & 3] = cast(short)i;
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_set_pi16(3, 2, 1, 0);
    short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4);
    short[4] correct = [0, 42, 2, 3];
    assert(R.array == correct);
}

/// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory.
// `p` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128 _mm_load_ps(const(float)*p) pure @trusted // FUTURE shouldn't be trusted, see #62
{
    pragma(inline, true);
    return *cast(__m128*)p;
}
unittest
{
    static immutable align(16) float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
    __m128 A = _mm_load_ps(correct.ptr);
    assert(A.array == correct);
}

/// Load a single-precision (32-bit) floating-point element from memory into all elements.
__m128 _mm_load_ps1(const(float)*p) pure @trusted
{
    // Vector construction from a scalar broadcasts it to all 4 lanes.
    return __m128(*p);
}
unittest
{
    float n = 2.5f;
    float[4] correct = [2.5f, 2.5f, 2.5f, 2.5f];
    __m128 A = _mm_load_ps1(&n);
    assert(A.array == correct);
}

/// Load a single-precision (32-bit) floating-point element from memory into the lower of dst, and zero the upper 3
/// elements. `mem_addr` does not need to be aligned on any particular boundary.
__m128 _mm_load_ss (const(float)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        return cast(__m128)__simd(XMM.LODSS, *cast(__m128*)mem_addr);
    }
    else
    {
        __m128 r; // PERF =void;
        r.ptr[0] = *mem_addr;
        r.ptr[1] = 0;
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    float n = 2.5f;
    float[4] correct = [2.5f, 0.0f, 0.0f, 0.0f];
    __m128 A = _mm_load_ss(&n);
    assert(A.array == correct);
}

/// Load a single-precision (32-bit) floating-point element from memory into all elements.
alias _mm_load1_ps = _mm_load_ps1;

/// Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of result,
/// and copy the lower 2 elements from `a` to result. `mem_addr does` not need to be aligned on any particular boundary.
__m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.LODHPS, a, *cast(const(__m128)*)mem_addr);
    }
    else
    {
        // x86: movlhps generated since LDC 1.9.0 -O1
        // Reinterpreting as long2 lets one 64-bit store replace both floats at once.
        long2 la = cast(long2)a;
        la.ptr[1] = (*mem_addr).array[0];
        return cast(__m128)la;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m64 M = to_m64(cast(__m128i)B);
    __m128 R = _mm_loadh_pi(A, &M);
    float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f];
    assert(R.array == correct);
}

/// Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of result,
/// and copy the upper 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
__m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted
{
    pragma(inline, true);

    // Disabled because of https://issues.dlang.org/show_bug.cgi?id=23046
    /*
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.LODLPS, a, *cast(const(__m128)*)mem_addr);
    }
    else */
    {
        // x86: movlpd/movlps generated with all LDC -01
        // Reinterpreting as long2 lets one 64-bit store replace both floats at once.
        long2 la = cast(long2)a;
        la.ptr[0] = (*mem_addr).array[0];
        return cast(__m128)la;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m64 M = to_m64(cast(__m128i)B);
    __m128 R = _mm_loadl_pi(A, &M);
    float[4] correct = [5.0f, 6.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}

/// Load 4 single-precision (32-bit) floating-point elements from memory in reverse order.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted // FUTURE shouldn't be trusted, see #62
{
    __m128* aligned = cast(__m128*)mem_addr; // x86: movaps + shups since LDC 1.0.0 -O1
    __m128 a = *aligned;
    static if (DMD_with_DSIMD)
    {
        // Shuffle immediate 27 == 0b00_01_10_11: full lane reversal.
        return cast(__m128) __simd(XMM.SHUFPS, a, a, 27);
    }
    else
    {
        __m128 r; // PERF =void;
        r.ptr[0] = a.array[3];
        r.ptr[1] = a.array[2];
        r.ptr[2] = a.array[1];
        r.ptr[3] = a.array[0];
        return r;
    }
}
unittest
{
    align(16) static immutable float[4] arr = [ 1.0f, 2.0f, 3.0f, 8.0f ];
    __m128 A = _mm_loadr_ps(arr.ptr);
    float[4] correct = [ 8.0f, 3.0f, 2.0f, 1.0f ];
    assert(A.array == correct);
}

/// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128 _mm_loadu_ps(const(float)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loadups(mem_addr);
    }
    else static if (LDC_with_optimizations)
    {
        // FIX: the previous version re-tested `LDC_with_optimizations` in a nested
        // `static if` inside this very branch, leaving a dead, unreachable scalar
        // fallback. The nested conditional is removed; behavior is unchanged.
        return loadUnaligned!(__m128)(mem_addr);
    }
    else version(DigitalMars)
    {
        static if (DMD_with_DSIMD)
        {
            return cast(__m128)__simd(XMM.LODUPS, *cast(const(float4*))mem_addr);
        }
        else static if (SSESizedVectorsAreEmulated)
        {
            // Since this vector is emulated, it doesn't have alignment constraints
            // and as such we can just cast it.
            return *cast(__m128*)(mem_addr);
        }
        else
        {
            // Scalar fallback: copy the 4 lanes one by one.
            __m128 result;
            result.ptr[0] = mem_addr[0];
            result.ptr[1] = mem_addr[1];
            result.ptr[2] = mem_addr[2];
            result.ptr[3] = mem_addr[3];
            return result;
        }
    }
    else
    {
        // Scalar fallback: copy the 4 lanes one by one.
        __m128 result;
        result.ptr[0] = mem_addr[0];
        result.ptr[1] = mem_addr[1];
        result.ptr[2] = mem_addr[2];
        result.ptr[3] = mem_addr[3];
        return result;
    }
}
unittest
{
    align(16) static immutable float[5] arr = [ 1.0f, 2.0f, 3.0f, 8.0f, 9.0f ];  // force unaligned load
    __m128 A = _mm_loadu_ps(&arr[1]);
    float[4] correct = [ 2.0f, 3.0f, 8.0f, 9.0f ];
    assert(A.array == correct);
}

/// Allocate size bytes of memory, aligned to the alignment specified in align,
/// and return a pointer to the allocated memory. `_mm_free` should be used to free
/// memory that is allocated with `_mm_malloc`.
void* _mm_malloc(size_t size, size_t alignment) @trusted
{
    assert(alignment != 0);
    size_t request = requestedSize(size, alignment);
    void* raw = malloc(request);
    if (request > 0 && raw == null) // malloc(0) can validly return anything
        onOutOfMemoryError();
    return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size
}

/// Conditionally store 8-bit integer elements from a into memory using mask (elements are not stored when the highest
/// bit is not set in the corresponding element) and a non-temporal memory hint.
void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted
{
    // this works since mask is zero-extended
    return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr);
}

deprecated("Use _mm_maskmove_si64 instead") alias _m_maskmovq = _mm_maskmove_si64;///

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum value.
__m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b)));
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed maximum values.
__m128 _mm_max_ps(__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.MAXPS, a, b);
    }
    else static if (GDC_with_SSE)
    {
        return __builtin_ia32_maxps(a, b);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_maxps(a, b);
    }
    else
    {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -02
        // The strict `>` matches maxps: if either operand is NaN, b is returned.
        __m128 r; // PERF =void;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        r[1] = (a[1] > b[1]) ? a[1] : b[1];
        r[2] = (a[2] > b[2]) ? a[2] : b[2];
        r[3] = (a[3] > b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_max_ps(A, B);
    assert(M.array[0] == 4);
    assert(M.array[1] == 2);
    assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
    assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
__m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b)));
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the maximum value in the
/// lower element of result, and copy the upper 3 packed elements from `a` to the upper element of result.
__m128 _mm_max_ss(__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.MAXSS, a, b);
    }
    else static if (GDC_with_SSE)
    {
        return __builtin_ia32_maxss(a, b);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_maxss(a, b);
    }
    else
    {
        // The strict `>` matches maxss NaN semantics: if either operand is NaN, b[0] is taken.
        __m128 r = a;
        r[0] = (a[0] > b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_max_ss(A, B);
    assert(M.array[0] == 4);
    assert(M.array[1] == 2);
    assert(M.array[2] == 3);
    assert(M.array[3] == 4);
    M = _mm_max_ps(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_max_ps(C, A); // in case of NaN, second operand prevails
    assert(M.array[0] == 1);
}

/// Compare packed signed 16-bit integers in a and b, and return packed minimum values.
__m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b)));
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed minimum values.
__m128 _mm_min_ps(__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.MINPS, a, b);
    }
    else static if (GDC_with_SSE)
    {
        return __builtin_ia32_minps(a, b);
    }
    else static if (LDC_with_SSE)
    {
        // not technically needed, but better perf in debug mode
        return __builtin_ia32_minps(a, b);
    }
    else
    {
        // ARM: Optimized into fcmgt + bsl since LDC 1.8 -02
        // The strict `<` matches minps: if either operand is NaN, b is returned.
        __m128 r; // PERF =void;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        r[1] = (a[1] < b[1]) ? a[1] : b[1];
        r[2] = (a[2] < b[2]) ? a[2] : b[2];
        r[3] = (a[3] < b[3]) ? a[3] : b[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, float.nan, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, float.nan);
    __m128 M = _mm_min_ps(A, B);
    assert(M.array[0] == 1);
    assert(M.array[1] == 1);
    assert(M.array[2] == 4);    // in case of NaN, second operand prevails (as it seems)
    assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems)
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
__m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b)));
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the minimum value in the
/// lower element of result, and copy the upper 3 packed elements from `a` to the upper element of result.
__m128 _mm_min_ss(__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.MINSS, a, b);
    }
    else static if (GDC_with_SSE)
    {
        return __builtin_ia32_minss(a, b);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_minss(a, b);
    }
    else
    {
        // Generates minss since LDC 1.3 -O1
        // The strict `<` matches minss NaN semantics: if either operand is NaN, b[0] is taken.
        __m128 r = a;
        r[0] = (a[0] < b[0]) ? a[0] : b[0];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1, 2, 3, 4);
    __m128 B = _mm_setr_ps(4, 1, 4, 1);
    __m128 C = _mm_setr_ps(float.nan, 1, 4, 1);
    __m128 M = _mm_min_ss(A, B);
    assert(M.array[0] == 1);
    assert(M.array[1] == 2);
    assert(M.array[2] == 3);
    assert(M.array[3] == 4);
    M = _mm_min_ps(A, C); // in case of NaN, second operand prevails
    assert(M.array[0] != M.array[0]);
    M = _mm_min_ps(C, A); // in case of NaN, second operand prevails
    assert(M.array[0] == 1);
}

/// Move the lower single-precision (32-bit) floating-point element from `b` to the lower element of result, and copy
/// the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted
{
    // Workaround https://issues.dlang.org/show_bug.cgi?id=21673
    // inlining of this function fails.
    version(DigitalMars) asm nothrow @nogc pure { nop; }

    a.ptr[0] = b.array[0];
    return a;
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_move_ss(A, B);
    float[4] correct = [5.0f, 2.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}

/// Move the upper 2 single-precision (32-bit) floating-point elements from `b` to the lower 2 elements of result, and
/// copy the upper 2 elements from `a` to the upper 2 elements of dst.
__m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted
{
    // PERF DMD
    // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443
    /*
    static if (DMD_with_DSIMD)
    {

        return cast(__m128) __simd(XMM.MOVHLPS, a, b);
    }
    else */
    {
        a.ptr[0] = b.array[2];
        a.ptr[1] = b.array[3];
        return a;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_movehl_ps(A, B);
    float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f];
    assert(R.array == correct);
}

/// Move the upper 2 32-bit integer elements from `b` to the lower 2 elements of result, and
/// copy the upper 2 elements from `a` to the upper 2 elements of dst.
__m128i _mm_movehl_epi32 (__m128i a, __m128i b) pure @trusted
{
    a.ptr[0] = b.array[2];
    a.ptr[1] = b.array[3];
    return a;
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    __m128i R = _mm_movehl_epi32(A, B);
    int[4] correct = [7, 8, 3, 4];
    assert(R.array == correct);
}

/// Move the lower 2 single-precision (32-bit) floating-point elements from `b` to the upper 2 elements of result, and
/// copy the lower 2 elements from `a` to the lower 2 elements of result
__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted
{
    // Was disabled because of https://issues.dlang.org/show_bug.cgi?id=19443
    static if (DMD_with_DSIMD && __VERSION__ >= 2101)
    {
        return cast(__m128) __simd(XMM.MOVLHPS, a, b);
    }
    else
    {
        a.ptr[2] = b.array[0];
        a.ptr[3] = b.array[1];
        return a;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_movelh_ps(A, B);
    float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f];
    assert(R.array == correct);
}

/// Move the lower 2 32-bit integers `b` to the upper 2 elements of result, and
/// copy the lower 2 elements from `a` to the lower 2 elements of result
__m128i _mm_movelh_epi32 (__m128i a, __m128i b) pure @trusted // #BONUS
{
    version(DigitalMars)
    {
        // Crash in DMD 2.098 with -O -inline -a x86
        // not sure when it was fixed
        pragma(inline, false);
        a.ptr[2] = b.array[0];
        a.ptr[3] = b.array[1];
    }
    else
    {
        a.ptr[2] = b.array[0];
        a.ptr[3] = b.array[1];
    }
    return a;
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    __m128i R = _mm_movelh_epi32(A, B);
    int[4] correct = [1, 2, 5, 6];
    assert(R.array == correct);
}

/// Create mask from the most significant bit of each 8-bit element in `a`.
int _mm_movemask_pi8 (__m64 a) pure @safe
{
    return _mm_movemask_epi8(to_m128i(a));
}
unittest
{
    assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0)));
}

/// Set each bit of result based on the most significant bit of the corresponding packed single-precision (32-bit)
/// floating-point element in `a`.
int _mm_movemask_ps (__m128 a) pure @trusted
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_movmskps(a);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_movmskps(a);
    }
    else static if (LDC_with_ARM)
    {
        // Move each lane's sign bit to bit 0, then shift lane i's bit to position i and sum.
        int4 ai = cast(int4)a;
        int4 shift31 = [31, 31, 31, 31];
        ai = ai >>> shift31;
        int4 shift = [0, 1, 2, 3];
        ai = ai << shift; // 4-way shift, only efficient on ARM.
        int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]);
        return r;
    }
    else
    {
        // Scalar fallback: a signed-negative lane means its MSB (sign bit) is set.
        int4 ai = cast(int4)a;
        int r = 0;
        if (ai.array[0] < 0) r += 1;
        if (ai.array[1] < 0) r += 2;
        if (ai.array[2] < 0) r += 4;
        if (ai.array[3] < 0) r += 8;
        return r;
    }
}
unittest
{
    int4 A = [-1, 0, -43, 0];
    assert(5 == _mm_movemask_ps(cast(float4)A));
}

/// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe
{
    pragma(inline, true);
    return a * b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ps(a, a);
    float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f];
    assert(a.array == correct);
}

/// Multiply the lower single-precision (32-bit) floating-point element in `a` and `b`, store the result in the lower
/// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.MULSS, a, b);
    else static if (GDC_with_SSE)
        return __builtin_ia32_mulss(a, b);
    else
    {
        a[0] *= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_mul_ss(a, a);
    float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f];
    assert(a.array == correct);
}

/// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers,
/// and return the high 16 bits of the intermediate integers.
__m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe
{
    // Widen to 128-bit and reuse the SSE2 implementation; only the low 64 bits are kept.
    return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi16(0, -16, 2, 3);
    __m64 B = _mm_set1_pi16(16384);
    short4 R = cast(short4)_mm_mulhi_pu16(A, B);
    short[4] correct = [0, 0x3FFC, 0, 0];
    assert(R.array == correct);
}

/// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`, and
/// return the result.
__m128 _mm_or_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128)__simd(XMM.ORPS, a, b);
    else
        return cast(__m128)(cast(__m128i)a | cast(__m128i)b);
}
unittest
{
    __m128 A = cast(__m128) _mm_set1_epi32(0x80000000);
    __m128 B = _mm_setr_ps(4.0f, -5.0, -9.5f, float.infinity);
    __m128 C = _mm_or_ps(A, B);
    float[4] correct = [-4.0f, -5.0, -9.5f, -float.infinity];
    assert(C.array == correct);
}

deprecated("Use _mm_avg_pu8 instead") alias _m_pavgb = _mm_avg_pu8;///
deprecated("Use _mm_avg_pu16 instead") alias _m_pavgw = _mm_avg_pu16;///
deprecated("Use _mm_extract_pi16 instead") alias _m_pextrw = _mm_extract_pi16;///
deprecated("Use _mm_insert_pi16 instead") alias _m_pinsrw = _mm_insert_pi16;///
deprecated("Use _mm_max_pi16 instead") alias _m_pmaxsw = _mm_max_pi16;///
deprecated("Use _mm_max_pu8 instead") alias _m_pmaxub = _mm_max_pu8;///
deprecated("Use _mm_min_pi16 instead") alias _m_pminsw = _mm_min_pi16;///
deprecated("Use _mm_min_pu8 instead") alias _m_pminub = _mm_min_pu8;///
deprecated("Use _mm_movemask_pi8 instead") alias _m_pmovmskb = _mm_movemask_pi8;///
deprecated("Use _mm_mulhi_pu16 instead") alias _m_pmulhuw = _mm_mulhi_pu16;///

enum _MM_HINT_T0  = 3; ///
enum _MM_HINT_T1  = 2; ///
enum _MM_HINT_T2  = 1; ///
enum _MM_HINT_NTA = 0; ///


version(LDC)
{
    // Starting with LLVM 10, it seems llvm.prefetch has changed its name.
    // Was reported at: https://github.com/ldc-developers/ldc/issues/3397
    static if (__VERSION__ >= 2091)
    {
        pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch"
        void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe;
    }
}

/// Fetch the line of data from memory that contains address `p` to a location in the
/// cache hierarchy specified by the locality hint i.
///
/// Warning: `locality` is a compile-time parameter, unlike in Intel Intrinsics API.
void _mm_prefetch(int locality)(const(void)* p) pure @trusted
{
    static if (GDC_with_SSE)
    {
        // Bit 2 of `locality` selects read vs write intent; bits 0-1 the cache level.
        return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3);
    }
    else static if (DMD_with_DSIMD)
    {
        enum bool isWrite = (locality & 0x4) != 0;
        enum level = locality & 3;
        return prefetch!(isWrite, level)(p);
    }
    else version(LDC)
    {
        static if ((__VERSION__ >= 2091) && (__VERSION__ < 2106))
        {
            // const_cast here. `llvm_prefetch` wants a mutable pointer
            llvm_prefetch_fixed( cast(void*)p, 0, locality, 1);
        }
        else
        {
            // const_cast here. `llvm_prefetch` wants a mutable pointer
            llvm_prefetch( cast(void*)p, 0, locality, 1);
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        static if (locality == _MM_HINT_NTA)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetchnta [RAX];
            }
        }
        else static if (locality == _MM_HINT_T0)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetcht0 [RAX];
            }
        }
        else static if (locality == _MM_HINT_T1)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetcht1 [RAX];
            }
        }
        else static if (locality == _MM_HINT_T2)
        {
            asm pure nothrow @nogc @trusted
            {
                mov RAX, p;
                prefetcht2 [RAX];
            }
        }
        else
            assert(false); // invalid locality hint
    }
    else version(D_InlineAsm_X86)
    {
        static if (locality == _MM_HINT_NTA)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetchnta [EAX];
            }
        }
        else static if (locality == _MM_HINT_T0)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetcht0 [EAX];
            }
        }
        else static if (locality == _MM_HINT_T1)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetcht1 [EAX];
            }
        }
        else static if (locality == _MM_HINT_T2)
        {
            asm pure nothrow @nogc @trusted
            {
                mov EAX, p;
                prefetcht2 [EAX];
            }
        }
        else
            assert(false); // invalid locality hint
    }
    else
    {
        // Generic version: do nothing. From bitter experience,
        // it's unlikely you get ANY speed-up with manual prefetching.
        // Prefetching or not doesn't change program behaviour.
    }
}
unittest
{
    // From Intel documentation:
    // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of
    // 32 bytes."
    ubyte[256] cacheline; // though it seems it cannot generate GP fault
    _mm_prefetch!_MM_HINT_T0(cacheline.ptr);
    _mm_prefetch!_MM_HINT_T1(cacheline.ptr);
    _mm_prefetch!_MM_HINT_T2(cacheline.ptr);
    _mm_prefetch!_MM_HINT_NTA(cacheline.ptr);
}

deprecated("Use _mm_sad_pu8 instead") alias _m_psadbw = _mm_sad_pu8;///
deprecated("Use _mm_shuffle_pi16 instead") alias _m_pshufw = _mm_shuffle_pi16;///


/// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in `a`,
/// and return the results. The maximum relative error for this approximation is less than 1.5*2^-12.
__m128 _mm_rcp_ps (__m128 a) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.RCPPS, a);
    }
    else static if (GDC_with_SSE)
    {
        return __builtin_ia32_rcpps(a);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_rcpps(a);
    }
    else
    {
        // Scalar fallback: exact division (more precise than the hardware approximation).
        a.ptr[0] = 1.0f / a.array[0];
        a.ptr[1] = 1.0f / a.array[1];
        a.ptr[2] = 1.0f / a.array[2];
        a.ptr[3] = 1.0f / a.array[3];
        return a;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f);
    __m128 groundTruth = _mm_set1_ps(1.0f) / A;
    __m128 result = _mm_rcp_ps(A);
    foreach(i; 0..4)
    {
        double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
        assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
    }
}

/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in `a`, store it
/// in the lower element of the result, and copy the upper 3 packed elements from `a` to the upper elements of result.
/// The maximum relative error for this approximation is less than 1.5*2^-12.
__m128 _mm_rcp_ss (__m128 a) pure @trusted
{
    // Disabled, see https://issues.dlang.org/show_bug.cgi?id=23049
    /*static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.RCPSS, a);
    }
    else*/
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_rcpss(a);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_rcpss(a);
    }
    else
    {
        // Scalar fallback: exact division on lane 0 only; upper lanes pass through.
        a.ptr[0] = 1.0f / a.array[0];
        return a;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f);
    __m128 correct = _mm_setr_ps(1 / 2.34f, -70000.0f, 0.00001f, 345.5f);
    __m128 R = _mm_rcp_ss(A);
    double relError = (cast(double)(correct.array[0]) / R.array[0]) - 1;
    assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
    assert(R.array[1] == correct.array[1]);
    assert(R.array[2] == correct.array[2]);
    assert(R.array[3] == correct.array[3]);
}

/// Reallocate `size` bytes of memory, aligned to the alignment specified in `alignment`, and
/// return a pointer to the newly allocated memory.
/// Previous data is preserved if any.
///
/// IMPORTANT: `size` MUST be > 0.
///
/// `_mm_free` MUST be used to free memory that is allocated with `_mm_malloc` or `_mm_realloc`.
/// Do NOT call _mm_realloc with size = 0.
void* _mm_realloc(void* aligned, size_t size, size_t alignment) nothrow @nogc // #BONUS
{
    // `aligned` may be null (acts as a fresh allocation); template flag `true` = preserve data.
    return alignedReallocImpl!true(aligned, size, alignment);
}
unittest
{
    enum NALLOC = 8;
    enum size_t[8] ALIGNMENTS = [1, 2, 4, 8, 16, 32, 64, 128];

    void*[NALLOC] alloc; // all null initially, so the first _mm_realloc acts as malloc

    foreach(t; 0..100)
    {
        foreach(n; 0..NALLOC)
        {
            size_t alignment = ALIGNMENTS[n];
            size_t s = 1 + ( (n + t * 69096) & 0xffff ); // pseudo-random size, always > 0
            alloc[n] = _mm_realloc(alloc[n], s, alignment);
            assert(isPointerAligned(alloc[n], alignment));
            // Touch every byte to make sure the whole buffer is writable.
            foreach(b; 0..s)
                (cast(ubyte*)alloc[n])[b] = cast(ubyte)n;
        }
    }
    foreach(n; 0..NALLOC)
    {
        _mm_free(alloc[n]);
    }
}

/// Reallocate `size` bytes of memory, aligned to the alignment specified in `alignment`, and
/// return a pointer to the newly allocated memory.
/// Previous data is discarded.
///
/// IMPORTANT: `size` MUST be > 0.
///
/// `_mm_free` MUST be used to free memory that is allocated with `_mm_malloc` or `_mm_realloc`.
void* _mm_realloc_discard(void* aligned, size_t size, size_t alignment) nothrow @nogc // #BONUS
{
    // Template flag `false` = previous contents need not be copied over.
    return alignedReallocImpl!false(aligned, size, alignment);
}

/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in `a`.
/// The maximum relative error for this approximation is less than 1.5*2^-12.
__m128 _mm_rsqrt_ps (__m128 a) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.RSQRTPS, a);
    }
    else static if (GDC_with_SSE)
    {
        return __builtin_ia32_rsqrtps(a);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_rsqrtps(a);
    }
    else version(LDC)
    {
        // LDC without SSE: exact 1/sqrt per lane (more precise than the hardware approximation).
        a[0] = 1.0f / llvm_sqrt(a[0]);
        a[1] = 1.0f / llvm_sqrt(a[1]);
        a[2] = 1.0f / llvm_sqrt(a[2]);
        a[3] = 1.0f / llvm_sqrt(a[3]);
        return a;
    }
    else
    {
        // Generic fallback: exact 1/sqrt per lane.
        a.ptr[0] = 1.0f / sqrt(a.array[0]);
        a.ptr[1] = 1.0f / sqrt(a.array[1]);
        a.ptr[2] = 1.0f / sqrt(a.array[2]);
        a.ptr[3] = 1.0f / sqrt(a.array[3]);
        return a;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f);
    __m128 groundTruth = _mm_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f);
    __m128 result = _mm_rsqrt_ps(A);
    foreach(i; 0..4)
    {
        double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1;
        assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093
    }
}

/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in `a`,
/// store the result in the lower element. Copy the upper 3 packed elements from `a` to the upper elements of result.
/// The maximum relative error for this approximation is less than 1.5*2^-12.
__m128 _mm_rsqrt_ss (__m128 a) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.RSQRTSS, a);
    }
    else static if (GDC_with_SSE)
    {
        return __builtin_ia32_rsqrtss(a);
    }
    else static if (LDC_with_SSE)
    {
        return __builtin_ia32_rsqrtss(a);
    }
    else version(LDC)
    {
        // Exact 1/sqrt on lane 0 only; upper lanes pass through.
        a[0] = 1.0f / llvm_sqrt(a[0]);
        return a;
    }
    else
    {
        a[0] = 1.0f / sqrt(a[0]);
        return a;
    }
}
unittest // this one test 4 different intrinsics: _mm_rsqrt_ss, _mm_rsqrt_ps, _mm_rcp_ps, _mm_rcp_ss
{
    double maxRelativeError = 0.000245; // -72 dB, stuff is apparently more precise than said in the doc?
    void testApproximateSSE(float number) nothrow @nogc
    {
        __m128 A = _mm_set1_ps(number);

        // test _mm_rcp_ps
        __m128 B = _mm_rcp_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / A.array[i];
            double ratio = cast(double)(B.array[i]) / cast(double)(exact);
            assert(abs_double(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rcp_ss
        {
            B = _mm_rcp_ss(A);
            double exact = 1.0f / A.array[0];
            double ratio = cast(double)(B.array[0]) / cast(double)(exact);
            assert(abs_double(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ps
        B = _mm_rsqrt_ps(A);
        foreach(i; 0..4)
        {
            double exact = 1.0f / sqrt(A.array[i]);
            double ratio = cast(double)(B.array[i]) / cast(double)(exact);
            assert(abs_double(ratio - 1) <= maxRelativeError);
        }

        // test _mm_rsqrt_ss
        {
            B = _mm_rsqrt_ss(A);
            double exact = 1.0f / sqrt(A.array[0]);
            double ratio = cast(double)(B.array[0]) / cast(double)(exact);
            assert(abs_double(ratio - 1) <= maxRelativeError);
        }
    }

    testApproximateSSE(0.00001f);
    testApproximateSSE(1.1f);
    testApproximateSSE(345.0f);
    testApproximateSSE(2.45674864151f);
    testApproximateSSE(700000.0f);
    testApproximateSSE(10000000.0f);
    testApproximateSSE(27841456468.0f);
}

/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each
/// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the
/// low 16 bits of result.
__m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe
{
    // Widen to 128-bit and reuse the SSE2 implementation; only the low 64-bit lane is kept.
    return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_setr_pi8(2, 4, 8, 0, 0, 0, 0, 0);
    __m64 B = _mm_setr_pi8(1, 1, 1, 1, 0, 0, 0, 0);
    __m64 R = _mm_sad_pu8(A, B);
    // |2-1| + |4-1| + |8-1| + |0-1| + 0 + 0 + 0 + 0 = 12, in the low 16 bits.
    assert(R.array[0] == 12);
}

/// Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer
/// `_MM_MASK_xxxx`. The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, `_MM_MASK_DIV_ZERO`,
/// `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe
{
    // Note: unsupported on ARM
    _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx);
}

/// Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer
/// `_MM_EXCEPT_xxxx`. The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`,
/// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`.
void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe
{
    // Note: unsupported on ARM
    _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx);
}

/// Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer
/// `_MM_FLUSH_xxxx`. The flush zero may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe
{
    // Read-modify-write of MXCSR: only the flush-to-zero bit is touched.
    _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx);
}

/// Set packed single-precision (32-bit) floating-point elements with the supplied values.
/// Note: arguments are in Intel order, so `e3` lands in the highest lane and `e0` in the lowest.
__m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    __m128 r;
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    r.ptr[2] = e2;
    r.ptr[3] = e3;
    return r;
}
unittest
{
    __m128 A = _mm_set_ps(3, 2, 1, 546);
    float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f];
    assert(A.array == correct);

    // Very old LDC, like 1.17, cannot cast __vector at CT
    static if (__VERSION__ >= 2094)
    {
        static immutable B = _mm_set_ps(3, 2, 1, 546);
        enum C = _mm_set_ps(3, 2, 1, 546);
    }
}

deprecated("Use _mm_set1_ps instead") alias _mm_set_ps1 = _mm_set1_ps; ///

/// Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer
/// `_MM_ROUND_xxxx`. The rounding mode may contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`,
/// `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe
{
    // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    version(GNU) asm nothrow @nogc @trusted { "" : : : "memory"; }
    _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx);
}

/// Copy single-precision (32-bit) floating-point element `a` to the lower element of result, and zero the upper 3 elements.
__m128 _mm_set_ss (float a) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.LODSS, a);
    }
    else
    {
        __m128 r = _mm_setzero_ps();
        r.ptr[0] = a;
        return r;
    }
}
unittest
{
    float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f];
    __m128 A = _mm_set_ss(42.0f);
    assert(A.array == correct);
}

/// Broadcast single-precision (32-bit) floating-point value `a` to all elements.
__m128 _mm_set1_ps (float a) pure @trusted
{
    pragma(inline, true);
    // D vector assignment from a scalar broadcasts to every lane.
    __m128 r = a;
    return r;
}
unittest
{
    float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f];
    __m128 A = _mm_set1_ps(42.0f);
    assert(A.array == correct);

    static if (__VERSION__ >= 2094)
    {
        enum __m128 B = _mm_set1_ps(2.4f);
    }
}

/// Set the MXCSR control and status register with the value in unsigned 32-bit integer `controlWord`.
void _mm_setcsr(uint controlWord) @trusted
{
    static if (LDC_with_ARM)
    {
        // Convert from SSE to ARM control word. This is done _partially_
        // and only support rounding mode changes.

        // "To alter some bits of a VFP system register without
        // affecting other bits, use a read-modify-write procedure"
        uint fpscr = arm_get_fpcr();

        // Bits 23 to 22 are rounding modes, however not used in NEON
        fpscr = fpscr & ~_MM_ROUND_MASK_ARM;
        switch(controlWord & _MM_ROUND_MASK)
        {
            default:
            case _MM_ROUND_NEAREST:     fpscr |= _MM_ROUND_NEAREST_ARM;     break;
            case _MM_ROUND_DOWN:        fpscr |= _MM_ROUND_DOWN_ARM;        break;
            case _MM_ROUND_UP:          fpscr |= _MM_ROUND_UP_ARM;          break;
            case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break;
        }
        fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM;
        if (controlWord & _MM_FLUSH_ZERO_MASK)
            fpscr |= _MM_FLUSH_ZERO_MASK_ARM;
        arm_set_fpcr(fpscr);
    }
    else version(GNU)
    {
        static if (GDC_with_SSE)
        {
            // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
            version(GNU) asm nothrow @nogc @trusted { "" : : : "memory"; }
            __builtin_ia32_ldmxcsr(controlWord);
        }
        else version(X86)
        {
            asm nothrow @nogc @trusted
            {
                "ldmxcsr %0;\n"
                  :
                  : "m" (controlWord)
                  : ;
            }
        }
        else return __warn_noop();
    }
    else version (InlineX86Asm)
    {
        asm nothrow @nogc @trusted
        {
            ldmxcsr controlWord;
        }
    }
    else
        static assert(0, "Not yet supported");
}
unittest
{
    _mm_setcsr(_mm_getcsr());
}

/// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order.
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted
{
    pragma(inline, true);

    if (__ctfe)
    {
        // CTFE path: `= void` is not allowed at compile-time.
        __m128 r;
        r.ptr[0] = e3;
        r.ptr[1] = e2;
        r.ptr[2] = e1;
        r.ptr[3] = e0;
        return r;
    }
    else
    {
        // This small = void here wins a bit in all optimization levels in GDC
        // and in -O0 in LDC.
        __m128 r = void;
        r.ptr[0] = e3;
        r.ptr[1] = e2;
        r.ptr[2] = e1;
        r.ptr[3] = e0;
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(3, 2, 1, 546);
    float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f];
    assert(A.array == correct);

    // Very old LDC, like 1.17, cannot cast __vector at CT
    static if (__VERSION__ >= 2094)
    {
        static immutable B = _mm_setr_ps(3, 2, 1, 546);
        enum C = _mm_setr_ps(3, 2, 1, 546);
    }
}

/// Return vector of type `__m128` with all elements set to zero.
__m128 _mm_setzero_ps() pure @trusted
{
    pragma(inline, true);

    // Note: for all compilers, this works best in debug builds, and in DMD -O
    int4 r;
    return cast(__m128)r;
}
unittest
{
    __m128 R = _mm_setzero_ps();
    float[4] correct = [0.0f, 0, 0, 0];
    assert(R.array == correct);
}

/// Do a serializing operation on all store-to-memory instructions that were issued prior
/// to this instruction. Guarantees that every store instruction that precedes, in program order,
/// is globally visible before any store instruction which follows the fence in program order.
void _mm_sfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE)
        {
            __builtin_ia32_sfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "sfence;\n" : : : ;
            }
        }
        else return __warn_noop();
    }
    else static if (LDC_with_SSE)
    {
        __builtin_ia32_sfence();
    }
    else static if (DMD_with_asm)
    {
        // PERF: can't be inlined in DMD, probably because of that assembly.
        asm nothrow @nogc pure @trusted
        {
            sfence;
        }
    }
    else static if (LDC_with_ARM64)
    {
        __builtin_arm_dmb(10); // dmb ishst
    }
    else version(LDC)
    {
        // When the architecture is unknown, generate a full memory barrier,
        // as the semantics of sfence do not really match those of atomics.
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_sfence();
}

/// Shuffle 16-bit integers in `a` using the control in `imm8`.
/// Warning: the shuffle value `imm8` is given at compile-time instead of runtime.
__m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @trusted
{
    // PERF DMD + D_SIMD
    version(LDC)
    {
        return cast(__m64) shufflevectorLDC!(short4, ( (imm8 >> 0) & 3 ),
                                                     ( (imm8 >> 2) & 3 ),
                                                     ( (imm8 >> 4) & 3 ),
                                                     ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a);
    }
    else
    {
        // GDC optimizes that correctly starting with -O2
        short4 sa = cast(short4)a;
        short4 r = void;
        r.ptr[0] = sa.array[ (imm8 >> 0) & 3 ];
        r.ptr[1] = sa.array[ (imm8 >> 2) & 3 ];
        r.ptr[2] = sa.array[ (imm8 >> 4) & 3 ];
        r.ptr[3] = sa.array[ (imm8 >> 6) & 3 ];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A);
    short[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` using the control in `imm8`.
/// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime.
__m128 _mm_shuffle_ps(ubyte imm8)(__m128 a, __m128 b) pure @trusted
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_shufps(a, b, imm8);
    }
    else static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.SHUFPS, a, b, imm8);
    }
    else static if (LDC_with_optimizations)
    {
        // Lanes 0-1 come from `a`, lanes 2-3 from `b` (indices 4-7 in the combined vector).
        return shufflevectorLDC!(__m128, imm8 & 3, (imm8>>2) & 3,
                                 4 + ((imm8>>4) & 3), 4 + ((imm8>>6) & 3) )(a, b);
    }
    else
    {
        // Generic fallback: each 2-bit field of imm8 selects a source lane.
        float4 r = void;
        r.ptr[0] = a.array[ (imm8 >> 0) & 3 ];
        r.ptr[1] = a.array[ (imm8 >> 2) & 3 ];
        r.ptr[2] = b.array[ (imm8 >> 4) & 3 ];
        r.ptr[3] = b.array[ (imm8 >> 6) & 3 ];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(0, 1, 2, 3);
    __m128 B = _mm_setr_ps(4, 5, 6, 7);
    __m128 C = _mm_shuffle_ps!0x9c(A, B);
    float[4] correct = [0.0f, 3, 5, 6];
    assert(C.array == correct);
}

/// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`.
__m128 _mm_sqrt_ps(__m128 a) @trusted
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_sqrtps(a);
    }
    else static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.SQRTPS, a);
    }
    else version(LDC)
    {
        // Disappeared with LDC 1.11
        static if (__VERSION__ < 2081)
            return __builtin_ia32_sqrtps(a);
        else
        {
            // PERF: use llvm_sqrt on the vector, works better
            a[0] = llvm_sqrt(a[0]);
            a[1] = llvm_sqrt(a[1]);
            a[2] = llvm_sqrt(a[2]);
            a[3] = llvm_sqrt(a[3]);
            return a;
        }
    }
    else
    {
        // Generic fallback: per-lane scalar sqrt.
        a.ptr[0] = sqrt(a.array[0]);
        a.ptr[1] = sqrt(a.array[1]);
        a.ptr[2] = sqrt(a.array[2]);
        a.ptr[3] = sqrt(a.array[3]);
        return a;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 2.0f);
    assert(A.array[2] == 2.0f);
    assert(A.array[3] == 2.0f);
}

/// Compute the square root of the lower single-precision (32-bit) floating-point element in `a`, store it in the lower
/// element, and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_sqrt_ss(__m128 a) @trusted
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_sqrtss(a);
    }
    // PERF DMD
    // TODO: enable when https://issues.dlang.org/show_bug.cgi?id=23437 is fixed for good
    /*else static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.SQRTSS, a);
    }*/
    else version(LDC)
    {
        a.ptr[0] = llvm_sqrt(a.array[0]);
        return a;
    }
    else
    {
        a.ptr[0] = sqrt(a.array[0]);
        return a;
    }
}
unittest
{
    __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f));
    assert(A.array[0] == 2.0f);
    assert(A.array[1] == 4.0f);
    assert(A.array[2] == 4.0f);
    assert(A.array[3] == 4.0f);
}

/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_ps (float* mem_addr, __m128 a) pure
{
    pragma(inline, true);
    // Writes through a raw pointer reinterpreted as __m128; alignment is the caller's responsibility.
    __m128* aligned = cast(__m128*)mem_addr;
    *aligned = a;
}

deprecated("Use _mm_store1_ps instead") alias _mm_store_ps1 = _mm_store1_ps; ///

/// Store the lower single-precision (32-bit) floating-point element from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_store_ss (float* mem_addr, __m128 a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[0];
}
unittest
{
    float a;
    _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546));
    assert(a == 546);
}

/// Store the lower single-precision (32-bit) floating-point element from `a` into 4 contiguous elements in memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE: shouldn't be trusted, see #62
{
    __m128* aligned = cast(__m128*)mem_addr;
    static if (DMD_with_DSIMD)
    {
        // SHUFPS with control 0 broadcasts lane 0 to all four lanes.
        __m128 r = cast(__m128) __simd(XMM.SHUFPS, a, a, 0);
    }
    else
    {
        __m128 r; // PERF =void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[0];
        r.ptr[2] = a.array[0];
        r.ptr[3] = a.array[0];
    }
    *aligned = r;
}
unittest
{
    align(16) float[4] A;
    _mm_store1_ps(A.ptr, _mm_set_ss(42.0f));
    float[4] correct = [42.0f, 42, 42, 42];
    assert(A == correct);
}

/// Store the upper 2 single-precision (32-bit) floating-point elements from `a` into memory.
void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted
{
    pragma(inline, true);
    // Reinterpret as two 64-bit lanes and store the high lane.
    long2 la = cast(long2)a;
    (*p).ptr[0] = la.array[1];
}
unittest
{
    __m64 R = _mm_setzero_si64();
    long2 A = [13, 25];
    _mm_storeh_pi(&R, cast(__m128)A);
    assert(R.array[0] == 25);
}

/// Store the lower 2 single-precision (32-bit) floating-point elements from `a` into memory.
void _mm_storel_pi(__m64* p, __m128 a) pure @trusted
{
    pragma(inline, true);
    // Reinterpret as two 64-bit lanes and store the low lane.
    long2 la = cast(long2)a;
    (*p).ptr[0] = la.array[0];
}
unittest
{
    __m64 R = _mm_setzero_si64();
    long2 A = [13, 25];
    _mm_storel_pi(&R, cast(__m128)A);
    assert(R.array[0] == 13);
}

/// Store 4 single-precision (32-bit) floating-point elements from `a` into memory in reverse order.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE should not be trusted
{
    __m128* aligned = cast(__m128*)mem_addr;
    static if (DMD_with_DSIMD)
    {
        // SHUFPS control 27 == 0b00_01_10_11, which reverses the four lanes.
        __m128 r = cast(__m128) __simd(XMM.SHUFPS, a, a, 27);
    }
    else
    {
        __m128 r; // PERF =void;
        r.ptr[0] = a.array[3];
        r.ptr[1] = a.array[2];
        r.ptr[2] = a.array[1];
        r.ptr[3] = a.array[0];
    }
    *aligned = r;
}
unittest
{
    align(16) float[4] A;
    _mm_storer_ps(A.ptr, _mm_setr_ps(1.0f, 2, 3, 4));
    float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f];
    assert(A == correct);
}

/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE should not be trusted, see #62
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        cast(void) __simd_sto(XMM.STOUPS, *cast(void16*)(cast(float*)mem_addr), a);
    }
    else static if (GDC_with_SSE)
    {
        __builtin_ia32_storeups(mem_addr, a); // better in -O0
    }
    else static if (LDC_with_optimizations)
    {
        storeUnaligned!(float4)(a, mem_addr);
    }
    else
    {
        // Generic fallback: four scalar stores, which have no alignment requirement.
        mem_addr[0] = a.array[0];
        mem_addr[1] = a.array[1];
        mem_addr[2] = a.array[2];
        mem_addr[3] = a.array[3];
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2, 3, 4);
    align(16) float[6] R = [0.0f, 0, 0, 0, 0, 0];
    float[4] correct = [1.0f, 2, 3, 4];
    _mm_storeu_ps(&R[1], A); // deliberately misaligned store target
    assert(R[1..5] == correct);
}

/// Store 64-bits of integer data from `a` into memory using a non-temporal memory hint.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_pi (__m64* mem_addr, __m64 a) pure @trusted
{
    // Delegate to the 64-bit integer non-temporal store.
    _mm_stream_si64(cast(long*)mem_addr, a.array[0]);
}

/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from
/// `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_ps (float* mem_addr, __m128 a)
{
    // TODO report this bug: DMD generates no stream instruction when using D_SIMD
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_movntps(mem_addr, a);
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        // An LLVM store tagged with !nontemporal metadata lowers to a streaming store (MOVNTPS).
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x float> %1, <4 x float>* %0, align 16, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, __m128*, float4)(cast(__m128*)mem_addr, a);

    }
    else
    {
        // Regular store instead.
        __m128* dest = cast(__m128*)mem_addr;
        *dest = a; // it's a regular move instead
    }
}
unittest
{
    align(16) float[4] A;
    _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f));
    assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f);
}

/// Subtract packed single-precision (32-bit) floating-point elements in `b` from packed single-precision (32-bit)
/// floating-point elements in `a`.
__m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe
{
    pragma(inline, true);
    // D's native vector subtraction maps directly to SUBPS.
    return a - b;
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ps(a, a);
    float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(a.array == correct);
}

/// Subtract the lower single-precision (32-bit) floating-point element in `b` from the lower single-precision (32-bit)
/// floating-point element in `a`, store the subtraction result in the lower element of result, and copy the upper 3
/// packed elements from `a` to the upper elements of result.
__m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.SUBSS, a, b);
    else static if (GDC_with_SSE)
        return __builtin_ia32_subss(a, b);
    else
    {
        // Scalar fallback: only lane 0 changes, lanes 1..3 come from `a`.
        a[0] -= b[0];
        return a;
    }
}
unittest
{
    __m128 a = [1.5f, -2.0f, 3.0f, 1.0f];
    a = _mm_sub_ss(a, a);
    float[4] correct = [0.0f, -2.0, 3.0f, 1.0f];
    assert(a.array == correct);
}

/// Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in row0, row1,
/// row2, and row3, and store the transposed matrix in these vectors (row0 now contains column 0, etc.).
void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe
{
    // Stage 1: interleave 32-bit lanes within each pair of rows.
    __m128 lo01 = _mm_unpacklo_ps(row0, row1); // a0 b0 a1 b1
    __m128 hi01 = _mm_unpackhi_ps(row0, row1); // a2 b2 a3 b3
    __m128 lo23 = _mm_unpacklo_ps(row2, row3); // c0 d0 c1 d1
    __m128 hi23 = _mm_unpackhi_ps(row2, row3); // c2 d2 c3 d3

    // Stage 2: recombine 64-bit halves to produce the columns.
    row0 = _mm_movelh_ps(lo01, lo23); // a0 b0 c0 d0
    row1 = _mm_movehl_ps(lo23, lo01); // a1 b1 c1 d1
    row2 = _mm_movelh_ps(hi01, hi23); // a2 b2 c2 d2
    row3 = _mm_movehl_ps(hi23, hi01); // a3 b3 c3 d3
}
unittest
{
    // Transpose a 4x4 matrix counting 0..15 row-major; columns become rows.
    __m128 m0 = _mm_setr_ps(0, 1, 2, 3);
    __m128 m1 = _mm_setr_ps(4, 5, 6, 7);
    __m128 m2 = _mm_setr_ps(8, 9, 10, 11);
    __m128 m3 = _mm_setr_ps(12, 13, 14, 15);
    _MM_TRANSPOSE4_PS(m0, m1, m2, m3);
    float[4] col0 = [0.0f, 4, 8, 12];
    float[4] col1 = [1.0f, 5, 9, 13];
    float[4] col2 = [2.0f, 6, 10, 14];
    float[4] col3 = [3.0f, 7, 11, 15];
    assert(m0.array == col0);
    assert(m1.array == col1);
    assert(m2.array == col2);
    assert(m3.array == col3);
}

/// Transpose the 4x4 matrix formed by the 4 rows of 32-bit integer elements in row0, row1,
/// row2, and row3, and store the transposed matrix in these vectors (row0 now contains column 0, etc.).
void _MM_TRANSPOSE4_EPI32 (ref __m128i row0, ref __m128i row1, ref __m128i row2, ref __m128i row3) pure @safe // #BONUS
{
    // Integer analogue of _MM_TRANSPOSE4_PS: interleave 32-bit lanes,
    // then recombine 64-bit halves.
    __m128i lo01 = _mm_unpacklo_epi32(row0, row1); // a0 b0 a1 b1
    __m128i hi01 = _mm_unpackhi_epi32(row0, row1); // a2 b2 a3 b3
    __m128i lo23 = _mm_unpacklo_epi32(row2, row3); // c0 d0 c1 d1
    __m128i hi23 = _mm_unpackhi_epi32(row2, row3); // c2 d2 c3 d3
    row0 = _mm_movelh_epi32(lo01, lo23); // a0 b0 c0 d0
    row1 = _mm_movehl_epi32(lo23, lo01); // a1 b1 c1 d1
    row2 = _mm_movelh_epi32(hi01, hi23); // a2 b2 c2 d2
    row3 = _mm_movehl_epi32(hi23, hi01); // a3 b3 c3 d3
}
unittest
{
    __m128i m0 = _mm_setr_epi32(0, 1, 2, 3);
    __m128i m1 = _mm_set_epi32(7, 6, 5, 4); // _mm_set_epi32 takes lanes high-to-low
    __m128i m2 = _mm_setr_epi32(8, 9, 10, 11);
    __m128i m3 = _mm_setr_epi32(12, 13, 14, 15);
    _MM_TRANSPOSE4_EPI32(m0, m1, m2, m3);
    int[4] col0 = [0, 4, 8, 12];
    int[4] col1 = [1, 5, 9, 13];
    int[4] col2 = [2, 6, 10, 14];
    int[4] col3 = [3, 7, 11, 15];
    assert(m0.array == col0);
    assert(m1.array == col1);
    assert(m2.array == col2);
    assert(m3.array == col3);
}

// Note: the only difference between these intrinsics is the signalling
// behaviour of quiet NaNs. This is incorrect but the case where
// you would want to differentiate between qNaN and sNaN and then
// treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_ss = _mm_comieq_ss;
alias _mm_ucomige_ss = _mm_comige_ss;
alias _mm_ucomigt_ss = _mm_comigt_ss;
alias _mm_ucomile_ss = _mm_comile_ss;
alias _mm_ucomilt_ss = _mm_comilt_ss;
alias _mm_ucomineq_ss = _mm_comineq_ss;

/// Return vector of type `__m128` with undefined elements.
__m128 _mm_undefined_ps() pure @safe
{
    pragma(inline, true);
    __m128 result = void; // deliberately left uninitialized
    return result;
}

/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of `a` and `b`.
__m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted
{
    // PERF GDC use intrinsic
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.UNPCKHPS, a, b);
    }
    else static if (LDC_with_optimizations)
    {
        // shufflevector indices 2,6,3,7 interleave the high halves of a and b.
        enum ir = `%r = shufflevector <4 x float> %0, <4 x float> %1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
                   ret <4 x float> %r`;
        return LDCInlineIR!(ir, float4, float4, float4)(a, b);
    }
    else
    {
        // Portable fallback: result = [a2, b2, a3, b3].
        __m128 r; // PERF =void;
        r.ptr[0] = a.array[2];
        r.ptr[1] = b.array[2];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_unpackhi_ps(A, B);
    float[4] correct = [3.0f, 7.0f, 4.0f, 8.0f];
    assert(R.array == correct);
}

/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of `a` and `b`.
__m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted
{
    // PERF GDC use intrinsic
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.UNPCKLPS, a, b);
    }
    else static if (LDC_with_optimizations)
    {
        // shufflevector indices 0,4,1,5 interleave the low halves of a and b.
        enum ir = `%r = shufflevector <4 x float> %0, <4 x float> %1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
                   ret <4 x float> %r`;
        return LDCInlineIR!(ir, float4, float4, float4)(a, b);
    }
    else
    {
        // Portable fallback: result = [a0, b0, a1, b1].
        __m128 r; // PERF =void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        return r;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 R = _mm_unpacklo_ps(A, B);
    float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f];
    assert(R.array == correct);
}

/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
__m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128) __simd(XMM.XORPS, cast(void16) a, cast(void16) b);
    }
    else
    {
        // Bitwise XOR done through an integer reinterpretation of the float lanes.
        return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b);
    }
}
unittest
{
    // XOR with 0x80000000 flips only the sign bit of each lane.
    __m128 A = cast(__m128) _mm_set1_epi32(0x80000000);
    __m128 B = _mm_setr_ps(4.0f, -5.0, -9.5f, float.infinity);
    __m128 C = _mm_xor_ps(A, B);
    float[4] correct = [-4.0f, 5.0, 9.5f, -float.infinity];
    assert(C.array == correct);
}

// Helpers for the aligned heap allocation API (_mm_malloc / _mm_realloc / _mm_free).
private
{
    // Returns: `true` if the pointer is suitably aligned.
    // `alignment` is assumed to be a power of two (low-bit mask test).
    bool isPointerAligned(void* p, size_t alignment) pure
    {
        assert(alignment != 0);
        return ( cast(size_t)p & (alignment - 1) ) == 0;
    }

    // Returns: next pointer aligned with alignment bytes.
    void* nextAlignedPointer(void* start, size_t alignment) pure
    {
        return cast(void*)nextMultipleOf(cast(size_t)(start), alignment);
    }

    // Returns number of bytes to actually allocate when asking
    // for a particular alignment.
    // Layout: worst-case padding (alignment - 1) plus three metadata
    // words stored just below the aligned user pointer.
    @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        return askedSize + alignment - 1 + pointerSize * 3;
    }

    // Store pointer given by malloc + size + alignment
    // in the three words immediately below the returned aligned pointer,
    // in this order (closest to the user data first):
    //   [aligned - 1 word] raw malloc pointer
    //   [aligned - 2 words] user-requested size
    //   [aligned - 3 words] alignment
    @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure
    {
        enum size_t pointerSize = size_t.sizeof;
        // Reserve the metadata words before computing the aligned address.
        char* start = cast(char*)raw + pointerSize * 3;
        void* aligned = nextAlignedPointer(start, alignment);
        void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize);
        *rawLocation = raw;
        size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize);
        *sizeLocation = size;
        size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize);
        *alignmentLocation = alignment;
        assert( isPointerAligned(aligned, alignment) );
        return aligned;
    }

    // Returns: x, multiple of powerOfTwo, so that x >= n.
    @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow
    {
        // check power-of-two
        assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0));

        size_t mask = ~(powerOfTwo - 1);
        return (n + powerOfTwo - 1) & mask;
    }

    // Shared implementation behind the aligned realloc entry points;
    // PreserveDataIfResized selects whether old contents are copied over.
    void* alignedReallocImpl(bool PreserveDataIfResized)(void* aligned, size_t size, size_t alignment)
    {
        // Calling `_mm_realloc`, `_mm_realloc_discard` or `realloc` with size 0 is
        // Undefined Behavior, and not only since C23.
        // Moreover, alignedReallocImpl was buggy about it.
        assert(size != 0);

        // realloc(null, ...) semantics: behave like a fresh allocation.
        if (aligned is null)
            return _mm_malloc(size, alignment);

        assert(alignment != 0);
        assert(isPointerAligned(aligned, alignment));

        // Read the metadata written by storeRawPointerPlusInfo
        // (must happen before any free of the old block).
        size_t previousSize = *cast(size_t*)(cast(char*)aligned - size_t.sizeof * 2);
        size_t prevAlignment = *cast(size_t*)(cast(char*)aligned - size_t.sizeof * 3);

        // It is illegal to change the alignment across calls.
        assert(prevAlignment == alignment);

        void* raw = *cast(void**)(cast(char*)aligned - size_t.sizeof);
        size_t request = requestedSize(size, alignment);
        size_t previousRequest = requestedSize(previousSize, alignment);
        // Same alignment implies the same fixed overhead, so the deltas match
        // (holds even when the subtractions wrap, since both wrap identically).
        assert(previousRequest - request == previousSize - size);

        // Heuristic: if the requested size is within 25% to 100% of what is already
        // allocated (previousRequest < request * 4, i.e. request > previousRequest / 4)
        // then exit with the same pointer.
        // PERF it seems like `realloc` should do that, not us.
        if ( (previousRequest < request * 4) && (request <= previousRequest) )
            return aligned;

        void* newRaw = malloc(request);
        if (request > 0 && newRaw == null) // realloc(0) can validly return anything
            onOutOfMemoryError();

        void* newAligned = storeRawPointerPlusInfo(newRaw, size, alignment);

        static if (PreserveDataIfResized)
        {
            // Copy the smaller of old/new user sizes.
            size_t minSize = size < previousSize ? size : previousSize;
            memcpy(newAligned, aligned, minSize); // ok to use memcpy: newAligned is into new memory, always different from aligned
        }

        // Free previous data
        _mm_free(aligned);
        assert(isPointerAligned(newAligned, alignment));
        return newAligned;
    }
}

unittest
{
    assert(nextMultipleOf(0, 4) == 0);
    assert(nextMultipleOf(1, 4) == 4);
    assert(nextMultipleOf(2, 4) == 4);
    assert(nextMultipleOf(3, 4) == 4);
    assert(nextMultipleOf(4, 4) == 4);
    assert(nextMultipleOf(5, 4) == 8);

    {
        void* p = _mm_malloc(23, 16);
        assert(p !is null);
        assert(((cast(size_t)p) & 0xf) == 0);
        _mm_free(p);
    }

    // Zero-sized _mm_malloc must still return a non-null, freeable pointer.
    void* nullAlloc = _mm_malloc(0, 32);
    assert(nullAlloc != null);
    _mm_free(nullAlloc);
}

unittest
{
    // In C23, it is UB to call realloc with 0 size.
    // Ensure this is not the case, ever.

    int alignment = 1;
    void* alloc = _mm_malloc(18, alignment);

    // DO NOT DO THAT:
    //_mm_realloc(alloc, 0, alignment);

    // DO THAT:
    _mm_free(alloc);
}


// For some reason, order of declaration is important for this one
// so it is misplaced.
// Note: is just another name for _mm_cvtss_si32
alias _mm_cvt_ss2si = _mm_cvtss_si32;