1 /** 2 * SSE intrinsics. 3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE 4 * 5 * Copyright: Copyright Guillaume Piolat 2016-2020. 6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 7 */ 8 module inteli.xmmintrin; 9 10 public import inteli.types; 11 12 import inteli.internals; 13 14 import inteli.mmx; 15 import inteli.emmintrin; 16 17 import core.stdc.stdlib: malloc, free; 18 import core.stdc.string: memcpy; 19 import core.exception: onOutOfMemoryError; 20 21 version(D_InlineAsm_X86) 22 version = InlineX86Asm; 23 else version(D_InlineAsm_X86_64) 24 version = InlineX86Asm; 25 26 27 // SSE1 28 29 nothrow @nogc: 30 31 32 enum int _MM_EXCEPT_INVALID = 0x0001; /// MXCSR Exception states. 33 enum int _MM_EXCEPT_DENORM = 0x0002; ///ditto 34 enum int _MM_EXCEPT_DIV_ZERO = 0x0004; ///ditto 35 enum int _MM_EXCEPT_OVERFLOW = 0x0008; ///ditto 36 enum int _MM_EXCEPT_UNDERFLOW = 0x0010; ///ditto 37 enum int _MM_EXCEPT_INEXACT = 0x0020; ///ditto 38 enum int _MM_EXCEPT_MASK = 0x003f; /// MXCSR Exception states mask. 39 40 enum int _MM_MASK_INVALID = 0x0080; /// MXCSR Exception masks. 41 enum int _MM_MASK_DENORM = 0x0100; ///ditto 42 enum int _MM_MASK_DIV_ZERO = 0x0200; ///ditto 43 enum int _MM_MASK_OVERFLOW = 0x0400; ///ditto 44 enum int _MM_MASK_UNDERFLOW = 0x0800; ///ditto 45 enum int _MM_MASK_INEXACT = 0x1000; ///ditto 46 enum int _MM_MASK_MASK = 0x1f80; /// MXCSR Exception masks mask. 47 48 enum int _MM_ROUND_NEAREST = 0x0000; /// MXCSR Rounding mode. 49 enum int _MM_ROUND_DOWN = 0x2000; ///ditto 50 enum int _MM_ROUND_UP = 0x4000; ///ditto 51 enum int _MM_ROUND_TOWARD_ZERO = 0x6000; ///ditto 52 enum int _MM_ROUND_MASK = 0x6000; /// MXCSR Rounding mode mask. 53 54 enum int _MM_FLUSH_ZERO_MASK = 0x8000; /// MXCSR Denormal flush to zero mask. 55 enum int _MM_FLUSH_ZERO_ON = 0x8000; /// MXCSR Denormal flush to zero modes. 56 enum int _MM_FLUSH_ZERO_OFF = 0x0000; ///ditto 57 58 /// Add packed single-precision (32-bit) floating-point elements in `a` and `b`. 59 __m128 _mm_add_ps(__m128 a, __m128 b) pure @safe 60 { 61 pragma(inline, true); 62 return a + b; 63 } 64 unittest 65 { 66 __m128 a = [1, 2, 3, 4]; 67 a = _mm_add_ps(a, a); 68 assert(a.array[0] == 2); 69 assert(a.array[1] == 4); 70 assert(a.array[2] == 6); 71 assert(a.array[3] == 8); 72 } 73 74 /// Add the lower single-precision (32-bit) floating-point element 75 /// in `a` and `b`, store the result in the lower element of result, 76 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 77 __m128 _mm_add_ss(__m128 a, __m128 b) pure @safe 78 { 79 static if (GDC_with_SSE) 80 { 81 return __builtin_ia32_addss(a, b); 82 } 83 else static if (DMD_with_DSIMD) 84 { 85 return cast(__m128) __simd(XMM.ADDSS, a, b); 86 } 87 else 88 { 89 a[0] += b[0]; 90 return a; 91 } 92 } 93 unittest 94 { 95 __m128 a = [1, 2, 3, 4]; 96 a = _mm_add_ss(a, a); 97 assert(a.array == [2.0f, 2, 3, 4]); 98 } 99 100 /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`. 101 __m128 _mm_and_ps (__m128 a, __m128 b) pure @safe 102 { 103 pragma(inline, true); 104 return cast(__m128)(cast(__m128i)a & cast(__m128i)b); 105 } 106 unittest 107 { 108 float a = 4.32f; 109 float b = -78.99f; 110 int correct = (*cast(int*)(&a)) & (*cast(int*)(&b)); 111 __m128 A = _mm_set_ps(a, b, a, b); 112 __m128 B = _mm_set_ps(b, a, b, a); 113 int4 R = cast(int4)( _mm_and_ps(A, B) ); 114 assert(R.array[0] == correct); 115 assert(R.array[1] == correct); 116 assert(R.array[2] == correct); 117 assert(R.array[3] == correct); 118 } 119 120 /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`. 121 __m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe 122 { 123 static if (DMD_with_DSIMD) 124 return cast(__m128) __simd(XMM.ANDNPS, a, b); 125 else 126 return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b ); 127 } 128 unittest 129 { 130 float a = 4.32f; 131 float b = -78.99f; 132 int correct = ~(*cast(int*)(&a)) & (*cast(int*)(&b)); 133 int correct2 = (*cast(int*)(&a)) & ~(*cast(int*)(&b)); 134 __m128 A = _mm_set_ps(a, b, a, b); 135 __m128 B = _mm_set_ps(b, a, b, a); 136 int4 R = cast(int4)( _mm_andnot_ps(A, B) ); 137 assert(R.array[0] == correct2); 138 assert(R.array[1] == correct); 139 assert(R.array[2] == correct2); 140 assert(R.array[3] == correct); 141 } 142 143 /// Average packed unsigned 16-bit integers in ``a` and `b`. 144 __m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe 145 { 146 return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b))); 147 } 148 149 /// Average packed unsigned 8-bit integers in ``a` and `b`. 150 __m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe 151 { 152 return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b))); 153 } 154 155 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for equality. 156 __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe 157 { 158 static if (DMD_with_DSIMD) 159 return cast(__m128) __simd(XMM.CMPPS, a, b, 0); 160 else 161 return cast(__m128) cmpps!(FPComparison.oeq)(a, b); 162 } 163 unittest 164 { 165 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 166 __m128 B = _mm_setr_ps(3.0f, 2.0f, float.nan, float.nan); 167 __m128i R = cast(__m128i) _mm_cmpeq_ps(A, B); 168 int[4] correct = [0, -1, 0, 0]; 169 assert(R.array == correct); 170 } 171 172 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for equality, 173 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 174 __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe 175 { 176 static if (DMD_with_DSIMD) 177 return cast(__m128) __simd(XMM.CMPSS, a, b, 0); 178 else 179 return cast(__m128) cmpss!(FPComparison.oeq)(a, b); 180 } 181 unittest 182 { 183 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 184 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 185 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 186 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 187 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 188 __m128i R1 = cast(__m128i) _mm_cmpeq_ss(A, B); 189 __m128i R2 = cast(__m128i) _mm_cmpeq_ss(A, C); 190 __m128i R3 = cast(__m128i) _mm_cmpeq_ss(A, D); 191 __m128i R4 = cast(__m128i) _mm_cmpeq_ss(A, E); 192 int[4] correct1 = [-1, 0, 0, 0]; 193 int[4] correct2 = [0, 0, 0, 0]; 194 int[4] correct3 = [0, 0, 0, 0]; 195 int[4] correct4 = [0, 0, 0, 0]; 196 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 197 } 198 199 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal. 200 __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe 201 { 202 static if (DMD_with_DSIMD) 203 return cast(__m128) __simd(XMM.CMPPS, b, a, 2); 204 else 205 return cast(__m128) cmpps!(FPComparison.oge)(a, b); 206 } 207 unittest 208 { 209 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 210 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 211 __m128i R = cast(__m128i) _mm_cmpge_ps(A, B); 212 int[4] correct = [0, -1,-1, 0]; 213 assert(R.array == correct); 214 } 215 216 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal, 217 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 218 __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe 219 { 220 static if (DMD_with_DSIMD) 221 { 222 __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 2); 223 a[0] = c[0]; 224 return a; 225 } 226 else 227 return cast(__m128) cmpss!(FPComparison.oge)(a, b); 228 } 229 unittest 230 { 231 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 232 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 233 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 234 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 235 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 236 __m128i R1 = cast(__m128i) _mm_cmpge_ss(A, B); 237 __m128i R2 = cast(__m128i) _mm_cmpge_ss(A, C); 238 __m128i R3 = cast(__m128i) _mm_cmpge_ss(A, D); 239 __m128i R4 = cast(__m128i) _mm_cmpge_ss(A, E); 240 int[4] correct1 = [-1, 0, 0, 0]; 241 int[4] correct2 = [-1, 0, 0, 0]; 242 int[4] correct3 = [0, 0, 0, 0]; 243 int[4] correct4 = [0, 0, 0, 0]; 244 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 245 } 246 247 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than. 248 __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe 249 { 250 static if (DMD_with_DSIMD) 251 return cast(__m128) __simd(XMM.CMPPS, b, a, 1); 252 else 253 return cast(__m128) cmpps!(FPComparison.ogt)(a, b); 254 } 255 unittest 256 { 257 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 258 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 259 __m128i R = cast(__m128i) _mm_cmpgt_ps(A, B); 260 int[4] correct = [0, 0,-1, 0]; 261 assert(R.array == correct); 262 } 263 264 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than, 265 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 266 __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe 267 { 268 static if (DMD_with_DSIMD) 269 { 270 __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 1); 271 a[0] = c[0]; 272 return a; 273 } 274 else 275 return cast(__m128) cmpss!(FPComparison.ogt)(a, b); 276 } 277 unittest 278 { 279 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 280 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 281 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 282 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 283 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 284 __m128i R1 = cast(__m128i) _mm_cmpgt_ss(A, B); 285 __m128i R2 = cast(__m128i) _mm_cmpgt_ss(A, C); 286 __m128i R3 = cast(__m128i) _mm_cmpgt_ss(A, D); 287 __m128i R4 = cast(__m128i) _mm_cmpgt_ss(A, E); 288 int[4] correct1 = [0, 0, 0, 0]; 289 int[4] correct2 = [-1, 0, 0, 0]; 290 int[4] correct3 = [0, 0, 0, 0]; 291 int[4] correct4 = [0, 0, 0, 0]; 292 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 293 } 294 295 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal. 296 __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe 297 { 298 static if (DMD_with_DSIMD) 299 return cast(__m128) __simd(XMM.CMPPS, a, b, 2); 300 else 301 return cast(__m128) cmpps!(FPComparison.ole)(a, b); 302 } 303 unittest 304 { 305 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 306 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 307 __m128i R = cast(__m128i) _mm_cmple_ps(A, B); 308 int[4] correct = [-1, -1, 0, 0]; 309 assert(R.array == correct); 310 } 311 312 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal, 313 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 314 __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe 315 { 316 static if (DMD_with_DSIMD) 317 return cast(__m128) __simd(XMM.CMPSS, a, b, 2); 318 else 319 return cast(__m128) cmpss!(FPComparison.ole)(a, b); 320 } 321 unittest 322 { 323 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 324 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 325 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 326 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 327 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 328 __m128i R1 = cast(__m128i) _mm_cmple_ss(A, B); 329 __m128i R2 = cast(__m128i) _mm_cmple_ss(A, C); 330 __m128i R3 = cast(__m128i) _mm_cmple_ss(A, D); 331 __m128i R4 = cast(__m128i) _mm_cmple_ss(A, E); 332 int[4] correct1 = [-1, 0, 0, 0]; 333 int[4] correct2 = [0, 0, 0, 0]; 334 int[4] correct3 = [0, 0, 0, 0]; 335 int[4] correct4 = [-1, 0, 0, 0]; 336 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 337 } 338 339 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than. 340 __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe 341 { 342 static if (DMD_with_DSIMD) 343 return cast(__m128) __simd(XMM.CMPPS, a, b, 1); 344 else 345 return cast(__m128) cmpps!(FPComparison.olt)(a, b); 346 } 347 unittest 348 { 349 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 350 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 351 __m128i R = cast(__m128i) _mm_cmplt_ps(A, B); 352 int[4] correct = [-1, 0, 0, 0]; 353 assert(R.array == correct); 354 } 355 356 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than, 357 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 358 __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe 359 { 360 static if (DMD_with_DSIMD) 361 return cast(__m128) __simd(XMM.CMPSS, a, b, 1); 362 else 363 return cast(__m128) cmpss!(FPComparison.olt)(a, b); 364 } 365 unittest 366 { 367 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 368 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 369 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 370 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 371 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 372 __m128i R1 = cast(__m128i) _mm_cmplt_ss(A, B); 373 __m128i R2 = cast(__m128i) _mm_cmplt_ss(A, C); 374 __m128i R3 = cast(__m128i) _mm_cmplt_ss(A, D); 375 __m128i R4 = cast(__m128i) _mm_cmplt_ss(A, E); 376 int[4] correct1 = [0, 0, 0, 0]; 377 int[4] correct2 = [0, 0, 0, 0]; 378 int[4] correct3 = [0, 0, 0, 0]; 379 int[4] correct4 = [-1, 0, 0, 0]; 380 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 381 } 382 383 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-equal. 384 __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe 385 { 386 static if (DMD_with_DSIMD) 387 return cast(__m128) __simd(XMM.CMPPS, a, b, 4); 388 else 389 return cast(__m128) cmpps!(FPComparison.une)(a, b); 390 } 391 unittest 392 { 393 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 394 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 395 __m128i R = cast(__m128i) _mm_cmpneq_ps(A, B); 396 int[4] correct = [-1, 0, -1, -1]; 397 assert(R.array == correct); 398 } 399 400 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-equal, 401 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 402 __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe 403 { 404 static if (DMD_with_DSIMD) 405 return cast(__m128) __simd(XMM.CMPSS, a, b, 4); 406 else 407 return cast(__m128) cmpss!(FPComparison.une)(a, b); 408 } 409 unittest 410 { 411 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 412 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 413 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 414 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 415 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 416 __m128i R1 = cast(__m128i) _mm_cmpneq_ss(A, B); 417 __m128i R2 = cast(__m128i) _mm_cmpneq_ss(A, C); 418 __m128i R3 = cast(__m128i) _mm_cmpneq_ss(A, D); 419 __m128i R4 = cast(__m128i) _mm_cmpneq_ss(A, E); 420 int[4] correct1 = [0, 0, 0, 0]; 421 int[4] correct2 = [-1, 0, 0, 0]; 422 int[4] correct3 = [-1, 0, 0, 0]; 423 int[4] correct4 = [-1, 0, 0, 0]; 424 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 425 } 426 427 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal. 428 __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe 429 { 430 static if (DMD_with_DSIMD) 431 return cast(__m128) __simd(XMM.CMPPS, b, a, 6); 432 else 433 return cast(__m128) cmpps!(FPComparison.ult)(a, b); 434 } 435 unittest 436 { 437 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 438 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 439 __m128i R = cast(__m128i) _mm_cmpnge_ps(A, B); 440 int[4] correct = [-1, 0, 0, -1]; 441 assert(R.array == correct); 442 } 443 444 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal, 445 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 446 __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe 447 { 448 static if (DMD_with_DSIMD) 449 { 450 __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 6); 451 a[0] = c[0]; 452 return a; 453 } 454 else 455 return cast(__m128) cmpss!(FPComparison.ult)(a, b); 456 } 457 unittest 458 { 459 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 460 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 461 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 462 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 463 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 464 __m128i R1 = cast(__m128i) _mm_cmpnge_ss(A, B); 465 __m128i R2 = cast(__m128i) _mm_cmpnge_ss(A, C); 466 __m128i R3 = cast(__m128i) _mm_cmpnge_ss(A, D); 467 __m128i R4 = cast(__m128i) _mm_cmpnge_ss(A, E); 468 int[4] correct1 = [0, 0, 0, 0]; 469 int[4] correct2 = [0, 0, 0, 0]; 470 int[4] correct3 = [-1, 0, 0, 0]; 471 int[4] correct4 = [-1, 0, 0, 0]; 472 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 473 } 474 475 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than. 476 __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe 477 { 478 static if (DMD_with_DSIMD) 479 return cast(__m128) __simd(XMM.CMPPS, b, a, 5); 480 else 481 return cast(__m128) cmpps!(FPComparison.ule)(a, b); 482 } 483 unittest 484 { 485 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 486 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 487 __m128i R = cast(__m128i) _mm_cmpngt_ps(A, B); 488 int[4] correct = [-1, -1, 0, -1]; 489 assert(R.array == correct); 490 } 491 492 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than, 493 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 494 __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe 495 { 496 static if (DMD_with_DSIMD) 497 { 498 __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 5); 499 a[0] = c[0]; 500 return a; 501 } 502 else 503 return cast(__m128) cmpss!(FPComparison.ule)(a, b); 504 } 505 unittest 506 { 507 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 508 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 509 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 510 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 511 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 512 __m128i R1 = cast(__m128i) _mm_cmpngt_ss(A, B); 513 __m128i R2 = cast(__m128i) _mm_cmpngt_ss(A, C); 514 __m128i R3 = cast(__m128i) _mm_cmpngt_ss(A, D); 515 __m128i R4 = cast(__m128i) _mm_cmpngt_ss(A, E); 516 int[4] correct1 = [-1, 0, 0, 0]; 517 int[4] correct2 = [0, 0, 0, 0]; 518 int[4] correct3 = [-1, 0, 0, 0]; 519 int[4] correct4 = [-1, 0, 0, 0]; 520 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 521 } 522 523 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal. 524 __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe 525 { 526 static if (DMD_with_DSIMD) 527 return cast(__m128) __simd(XMM.CMPPS, a, b, 6); 528 else 529 return cast(__m128) cmpps!(FPComparison.ugt)(a, b); 530 } 531 unittest 532 { 533 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 534 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 535 __m128i R = cast(__m128i) _mm_cmpnle_ps(A, B); 536 int[4] correct = [0, 0, -1, -1]; 537 assert(R.array == correct); 538 } 539 540 541 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal, 542 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 543 __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe 544 { 545 static if (DMD_with_DSIMD) 546 return cast(__m128) __simd(XMM.CMPSS, a, b, 6); 547 else 548 return cast(__m128) cmpss!(FPComparison.ugt)(a, b); 549 } 550 unittest 551 { 552 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 553 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 554 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 555 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 556 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 557 __m128i R1 = cast(__m128i) _mm_cmpnle_ss(A, B); 558 __m128i R2 = cast(__m128i) _mm_cmpnle_ss(A, C); 559 __m128i R3 = cast(__m128i) _mm_cmpnle_ss(A, D); 560 __m128i R4 = cast(__m128i) _mm_cmpnle_ss(A, E); 561 int[4] correct1 = [0, 0, 0, 0]; 562 int[4] correct2 = [-1, 0, 0, 0]; 563 int[4] correct3 = [-1, 0, 0, 0]; 564 int[4] correct4 = [0, 0, 0, 0]; 565 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 566 } 567 568 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than. 569 __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe 570 { 571 static if (DMD_with_DSIMD) 572 return cast(__m128) __simd(XMM.CMPPS, a, b, 5); 573 else 574 return cast(__m128) cmpps!(FPComparison.uge)(a, b); 575 } 576 unittest 577 { 578 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 579 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 580 __m128i R = cast(__m128i) _mm_cmpnlt_ps(A, B); 581 int[4] correct = [0, -1, -1, -1]; 582 assert(R.array == correct); 583 } 584 585 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than, 586 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 587 __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe 588 { 589 static if (DMD_with_DSIMD) 590 return cast(__m128) __simd(XMM.CMPSS, a, b, 5); 591 else 592 return cast(__m128) cmpss!(FPComparison.uge)(a, b); 593 } 594 unittest 595 { 596 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 597 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 598 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 599 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 600 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 601 __m128i R1 = cast(__m128i) _mm_cmpnlt_ss(A, B); 602 __m128i R2 = cast(__m128i) _mm_cmpnlt_ss(A, C); 603 __m128i R3 = cast(__m128i) _mm_cmpnlt_ss(A, D); 604 __m128i R4 = cast(__m128i) _mm_cmpnlt_ss(A, E); 605 int[4] correct1 = [-1, 0, 0, 0]; 606 int[4] correct2 = [-1, 0, 0, 0]; 607 int[4] correct3 = [-1, 0, 0, 0]; 608 int[4] correct4 = [0, 0, 0, 0]; 609 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 610 } 611 612 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN. 613 __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe 614 { 615 static if (DMD_with_DSIMD) 616 return cast(__m128) __simd(XMM.CMPPS, a, b, 7); 617 else 618 return cast(__m128) cmpps!(FPComparison.ord)(a, b); 619 } 620 unittest 621 { 622 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 623 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 624 __m128i R = cast(__m128i) _mm_cmpord_ps(A, B); 625 int[4] correct = [-1, -1, -1, 0]; 626 assert(R.array == correct); 627 } 628 629 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN, 630 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 631 __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe 632 { 633 static if (DMD_with_DSIMD) 634 return cast(__m128) __simd(XMM.CMPSS, a, b, 7); 635 else 636 return cast(__m128) cmpss!(FPComparison.ord)(a, b); 637 } 638 unittest 639 { 640 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 641 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 642 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 643 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 644 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 645 __m128i R1 = cast(__m128i) _mm_cmpord_ss(A, B); 646 __m128i R2 = cast(__m128i) _mm_cmpord_ss(A, C); 647 __m128i R3 = cast(__m128i) _mm_cmpord_ss(A, D); 648 __m128i R4 = cast(__m128i) _mm_cmpord_ss(A, E); 649 int[4] correct1 = [-1, 0, 0, 0]; 650 int[4] correct2 = [-1, 0, 0, 0]; 651 int[4] correct3 = [0, 0, 0, 0]; 652 int[4] correct4 = [-1, 0, 0, 0]; 653 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 654 } 655 656 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN. 657 __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe 658 { 659 static if (DMD_with_DSIMD) 660 return cast(__m128) __simd(XMM.CMPPS, a, b, 3); 661 else 662 return cast(__m128) cmpps!(FPComparison.uno)(a, b); 663 } 664 unittest 665 { 666 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 667 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 668 __m128i R = cast(__m128i) _mm_cmpunord_ps(A, B); 669 int[4] correct = [0, 0, 0, -1]; 670 assert(R.array == correct); 671 } 672 673 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN. 674 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 675 __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe 676 { 677 static if (DMD_with_DSIMD) 678 return cast(__m128) __simd(XMM.CMPSS, a, b, 3); 679 else return cast(__m128) cmpss!(FPComparison.uno)(a, b); 680 } 681 unittest 682 { 683 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 684 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 685 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 686 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 687 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 688 __m128i R1 = cast(__m128i) _mm_cmpunord_ss(A, B); 689 __m128i R2 = cast(__m128i) _mm_cmpunord_ss(A, C); 690 __m128i R3 = cast(__m128i) _mm_cmpunord_ss(A, D); 691 __m128i R4 = cast(__m128i) _mm_cmpunord_ss(A, E); 692 int[4] correct1 = [0, 0, 0, 0]; 693 int[4] correct2 = [0, 0, 0, 0]; 694 int[4] correct3 = [-1, 0, 0, 0]; 695 int[4] correct4 = [0, 0, 0, 0]; 696 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 697 } 698 699 700 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for equality, 701 /// and return the boolean result (0 or 1). 702 int _mm_comieq_ss (__m128 a, __m128 b) pure @safe 703 { 704 return a.array[0] == b.array[0]; 705 } 706 unittest 707 { 708 assert(1 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 709 assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 710 assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 711 assert(0 == _mm_comieq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 712 assert(1 == _mm_comieq_ss(_mm_set_ss(0.0), _mm_set_ss(-0.0))); 713 } 714 715 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than-or-equal, 716 /// and return the boolean result (0 or 1). 717 int _mm_comige_ss (__m128 a, __m128 b) pure @safe 718 { 719 return a.array[0] >= b.array[0]; 720 } 721 unittest 722 { 723 assert(1 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 724 assert(1 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 725 assert(0 == _mm_comige_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f))); 726 assert(0 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 727 assert(0 == _mm_comige_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 728 assert(1 == _mm_comige_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f))); 729 } 730 731 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than, 732 /// and return the boolean result (0 or 1). 733 int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta 734 { 735 return a.array[0] > b.array[0]; 736 } 737 unittest 738 { 739 assert(0 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 740 assert(1 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 741 assert(0 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 742 assert(0 == _mm_comigt_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 743 assert(0 == _mm_comigt_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); 744 } 745 746 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than-or-equal, 747 /// and return the boolean result (0 or 1). 748 int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe 749 { 750 return a.array[0] <= b.array[0]; 751 } 752 unittest 753 { 754 assert(1 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 755 assert(0 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 756 assert(1 == _mm_comile_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f))); 757 assert(0 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 758 assert(0 == _mm_comile_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 759 assert(1 == _mm_comile_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); 760 } 761 762 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than, 763 /// and return the boolean result (0 or 1). 764 int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb 765 { 766 return a.array[0] < b.array[0]; 767 } 768 unittest 769 { 770 assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 771 assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 772 assert(1 == _mm_comilt_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f))); 773 assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 774 assert(0 == _mm_comilt_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 775 assert(0 == _mm_comilt_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f))); 776 } 777 778 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for not-equal, 779 /// and return the boolean result (0 or 1). 780 int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne 781 { 782 return a.array[0] != b.array[0]; 783 } 784 unittest 785 { 786 assert(0 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 787 assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 788 assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 789 assert(1 == _mm_comineq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 790 assert(0 == _mm_comineq_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); 791 } 792 793 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 794 /// floating-point elements, store the results in the lower 2 elements, 795 /// and copy the upper 2 packed elements from `a` to the upper elements of result. 796 alias _mm_cvt_pi2ps = _mm_cvtpi32_ps; 797 798 /// Convert 2 lower packed single-precision (32-bit) floating-point elements in `a` 799 /// to packed 32-bit integers. 800 __m64 _mm_cvt_ps2pi (__m128 a) @safe 801 { 802 return to_m64(_mm_cvtps_epi32(a)); 803 } 804 805 /// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element, 806 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 807 /// upper elements of the result. 808 __m128 _mm_cvt_si2ss (__m128 v, int x) pure @trusted 809 { 810 v.ptr[0] = cast(float)x; 811 return v; 812 } 813 unittest 814 { 815 __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42); 816 assert(a.array == [42f, 0, 0, 0]); 817 } 818 819 /// Convert packed 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements. 820 __m128 _mm_cvtpi16_ps (__m64 a) pure @safe 821 { 822 __m128i ma = to_m128i(a); 823 ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit 824 ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit 825 return _mm_cvtepi32_ps(ma); 826 } 827 unittest 828 { 829 __m64 A = _mm_setr_pi16(-1, 2, -3, 4); 830 __m128 R = _mm_cvtpi16_ps(A); 831 float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f]; 832 assert(R.array == correct); 833 } 834 835 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 836 /// floating-point elements, store the results in the lower 2 elements, 837 /// and copy the upper 2 packed elements from `a` to the upper elements of result. 838 __m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted 839 { 840 __m128 fb = _mm_cvtepi32_ps(to_m128i(b)); 841 a.ptr[0] = fb.array[0]; 842 a.ptr[1] = fb.array[1]; 843 return a; 844 } 845 unittest 846 { 847 __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2)); 848 float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f]; 849 assert(R.array == correct); 850 } 851 852 /// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements, 853 /// store the results in the lower 2 elements, then covert the packed signed 32-bit integers in `b` to 854 /// single-precision (32-bit) floating-point element, and store the results in the upper 2 elements. 855 __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted 856 { 857 long2 l; 858 l.ptr[0] = a.array[0]; 859 l.ptr[1] = b.array[0]; 860 return _mm_cvtepi32_ps(cast(__m128i)l); 861 } 862 unittest 863 { 864 __m64 A = _mm_setr_pi32(-45, 128); 865 __m64 B = _mm_setr_pi32(0, 1000); 866 __m128 R = _mm_cvtpi32x2_ps(A, B); 867 float[4] correct = [-45.0f, 128.0f, 0.0f, 1000.0f]; 868 assert(R.array == correct); 869 } 870 871 /// Convert the lower packed 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements. 872 __m128 _mm_cvtpi8_ps (__m64 a) pure @safe 873 { 874 __m128i b = to_m128i(a); 875 876 // Zero extend to 32-bit 877 b = _mm_unpacklo_epi8(b, _mm_setzero_si128()); 878 b = _mm_unpacklo_epi16(b, _mm_setzero_si128()); 879 880 // Replicate sign bit 881 b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24); // Replicate sign bit 882 return _mm_cvtepi32_ps(b); 883 } 884 unittest 885 { 886 __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0); 887 __m128 R = _mm_cvtpi8_ps(A); 888 float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f]; 889 assert(R.array == correct); 890 } 891 892 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 16-bit integers. 893 /// Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF. 894 __m64 _mm_cvtps_pi16 (__m128 a) @safe 895 { 896 // The C++ version of this intrinsic convert to 32-bit float, then use packssdw 897 // Which means the 16-bit integers should be saturated 898 __m128i b = _mm_cvtps_epi32(a); 899 b = _mm_packs_epi32(b, b); 900 return to_m64(b); 901 } 902 unittest 903 { 904 __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f); 905 short4 R = cast(short4) _mm_cvtps_pi16(A); 906 short[4] correct = [-1, 2, -32768, 32767]; 907 assert(R.array == correct); 908 } 909 910 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers. 911 __m64 _mm_cvtps_pi32 (__m128 a) @safe 912 { 913 return to_m64(_mm_cvtps_epi32(a)); 914 } 915 unittest 916 { 917 __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f, ); 918 int2 R = cast(int2) _mm_cvtps_pi32(A); 919 int[2] correct = [-33000, 70000]; 920 assert(R.array == correct); 921 } 922 923 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 8-bit integers, 924 /// and store the results in lower 4 elements. 925 /// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF. 926 __m64 _mm_cvtps_pi8 (__m128 a) @safe 927 { 928 // The C++ version of this intrinsic convert to 32-bit float, then use packssdw + packsswb 929 // Which means the 8-bit integers should be saturated 930 __m128i b = _mm_cvtps_epi32(a); 931 b = _mm_packs_epi32(b, _mm_setzero_si128()); 932 b = _mm_packs_epi16(b, _mm_setzero_si128()); 933 return to_m64(b); 934 } 935 unittest 936 { 937 __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f); 938 byte8 R = cast(byte8) _mm_cvtps_pi8(A); 939 byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0]; 940 assert(R.array == correct); 941 } 942 943 /// Convert packed unsigned 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements. 944 __m128 _mm_cvtpu16_ps (__m64 a) pure @safe 945 { 946 __m128i ma = to_m128i(a); 947 ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit 948 return _mm_cvtepi32_ps(ma); 949 } 950 unittest 951 { 952 __m64 A = _mm_setr_pi16(-1, 2, -3, 4); 953 __m128 R = _mm_cvtpu16_ps(A); 954 float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f]; 955 assert(R.array == correct); 956 } 957 958 /// Convert the lower packed unsigned 8-bit integers in `a` to packed single-precision (32-bit) floating-point element. 959 __m128 _mm_cvtpu8_ps (__m64 a) pure @safe 960 { 961 __m128i b = to_m128i(a); 962 963 // Zero extend to 32-bit 964 b = _mm_unpacklo_epi8(b, _mm_setzero_si128()); 965 b = _mm_unpacklo_epi16(b, _mm_setzero_si128()); 966 return _mm_cvtepi32_ps(b); 967 } 968 unittest 969 { 970 __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0); 971 __m128 R = _mm_cvtpu8_ps(A); 972 float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f]; 973 assert(R.array == correct); 974 } 975 976 /// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element, 977 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 978 /// upper elements of result. 979 __m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted 980 { 981 v.ptr[0] = cast(float)x; 982 return v; 983 } 984 unittest 985 { 986 __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42); 987 assert(a.array == [42.0f, 0, 0, 0]); 988 } 989 990 991 /// Convert the signed 64-bit integer `b` to a single-precision (32-bit) floating-point element, 992 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 993 /// upper elements of result. 994 __m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted 995 { 996 v.ptr[0] = cast(float)x; 997 return v; 998 } 999 unittest 1000 { 1001 __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42); 1002 assert(a.array == [42.0f, 0, 0, 0]); 1003 } 1004 1005 /// Take the lower single-precision (32-bit) floating-point element of `a`. 1006 float _mm_cvtss_f32(__m128 a) pure @safe 1007 { 1008 return a.array[0]; 1009 } 1010 1011 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer. 1012 int _mm_cvtss_si32 (__m128 a) @safe // PERF GDC 1013 { 1014 static if (GDC_with_SSE) 1015 { 1016 return __builtin_ia32_cvtss2si(a); 1017 } 1018 else static if (LDC_with_SSE) 1019 { 1020 return __builtin_ia32_cvtss2si(a); 1021 } 1022 else static if (DMD_with_DSIMD) 1023 { 1024 __m128 b; 1025 __m128i r = cast(__m128i) __simd(XMM.CVTPS2DQ, a); // Note: converts 4 integers. 1026 return r.array[0]; 1027 } 1028 else 1029 { 1030 return convertFloatToInt32UsingMXCSR(a.array[0]); 1031 } 1032 } 1033 unittest 1034 { 1035 assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f))); 1036 } 1037 1038 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer. 1039 long _mm_cvtss_si64 (__m128 a) @safe 1040 { 1041 version(LDC) 1042 { 1043 version(X86_64) 1044 { 1045 return __builtin_ia32_cvtss2si64(a); 1046 } 1047 else 1048 { 1049 // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer 1050 // using SSE instructions only. So the builtin doesn't exit for this arch. 1051 return convertFloatToInt64UsingMXCSR(a.array[0]); 1052 } 1053 } 1054 else 1055 { 1056 return convertFloatToInt64UsingMXCSR(a.array[0]); 1057 } 1058 } 1059 unittest 1060 { 1061 assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f))); 1062 1063 uint savedRounding = _MM_GET_ROUNDING_MODE(); 1064 1065 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 1066 assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f))); 1067 1068 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 1069 assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f))); 1070 1071 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 1072 assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f))); 1073 1074 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 1075 assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f))); 1076 1077 _MM_SET_ROUNDING_MODE(savedRounding); 1078 } 1079 1080 1081 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit 1082 /// integer with truncation. 1083 int _mm_cvtt_ss2si (__m128 a) pure @safe 1084 { 1085 // x86: cvttss2si always generated, even in -O0 1086 return cast(int)(a.array[0]); 1087 } 1088 alias _mm_cvttss_si32 = _mm_cvtt_ss2si; ///ditto 1089 unittest 1090 { 1091 assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); 1092 } 1093 1094 1095 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit 1096 /// integers with truncation. 1097 __m64 _mm_cvtt_ps2pi (__m128 a) pure @safe 1098 { 1099 return to_m64(_mm_cvttps_epi32(a)); 1100 } 1101 1102 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit 1103 /// integer with truncation. 1104 long _mm_cvttss_si64 (__m128 a) pure @safe 1105 { 1106 return cast(long)(a.array[0]); 1107 } 1108 unittest 1109 { 1110 assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); 1111 } 1112 1113 /// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`. 1114 __m128 _mm_div_ps(__m128 a, __m128 b) pure @safe 1115 { 1116 pragma(inline, true); 1117 return a / b; 1118 } 1119 unittest 1120 { 1121 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1122 a = _mm_div_ps(a, a); 1123 float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f]; 1124 assert(a.array == correct); 1125 } 1126 1127 /// Divide the lower single-precision (32-bit) floating-point element in `a` by the lower 1128 /// single-precision (32-bit) floating-point element in `b`, store the result in the lower 1129 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result. 1130 __m128 _mm_div_ss(__m128 a, __m128 b) pure @safe 1131 { 1132 static if (DMD_with_DSIMD) 1133 return cast(__m128) __simd(XMM.DIVSS, a, b); 1134 else static if (GDC_with_SSE) 1135 return __builtin_ia32_divss(a, b); 1136 else 1137 { 1138 a[0] /= b[0]; 1139 return a; 1140 } 1141 } 1142 unittest 1143 { 1144 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1145 a = _mm_div_ss(a, a); 1146 float[4] correct = [1.0f, -2.0, 3.0f, 1.0f]; 1147 assert(a.array == correct); 1148 } 1149 1150 /// Extract a 16-bit unsigned integer from `a`, selected with `imm8`. Zero-extended. 1151 int _mm_extract_pi16 (__m64 a, int imm8) 1152 { 1153 short4 sa = cast(short4)a; 1154 return cast(ushort)(sa.array[imm8]); 1155 } 1156 unittest 1157 { 1158 __m64 A = _mm_setr_pi16(-1, 6, 0, 4); 1159 assert(_mm_extract_pi16(A, 0) == 65535); 1160 assert(_mm_extract_pi16(A, 1) == 6); 1161 assert(_mm_extract_pi16(A, 2) == 0); 1162 assert(_mm_extract_pi16(A, 3) == 4); 1163 } 1164 1165 /// Free aligned memory that was allocated with `_mm_malloc`. 1166 void _mm_free(void * mem_addr) @trusted 1167 { 1168 // support for free(NULL) 1169 if (mem_addr is null) 1170 return; 1171 1172 // Technically we don't need to store size and alignement in the chunk, but we do in case we 1173 // have to implement _mm_realloc 1174 1175 size_t pointerSize = (void*).sizeof; 1176 void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof); 1177 size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize); 1178 size_t alignment = *alignmentLocation; 1179 assert(alignment != 0); 1180 assert(isPointerAligned(mem_addr, alignment)); 1181 free(*rawLocation); 1182 } 1183 1184 /// Get the exception mask bits from the MXCSR control and status register. 1185 /// The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, 1186 /// `_MM_MASK_DIV_ZERO`, `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`. 1187 /// Note: won't correspond to reality on non-x86, where MXCSR this is emulated. 1188 uint _MM_GET_EXCEPTION_MASK() @safe 1189 { 1190 return _mm_getcsr() & _MM_MASK_MASK; 1191 } 1192 1193 /// Get the exception state bits from the MXCSR control and status register. 1194 /// The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 1195 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`. 1196 /// Note: won't correspond to reality on non-x86, where MXCSR this is emulated. No exception reported. 1197 uint _MM_GET_EXCEPTION_STATE() @safe 1198 { 1199 return _mm_getcsr() & _MM_EXCEPT_MASK; 1200 } 1201 1202 /// Get the flush zero bits from the MXCSR control and status register. 1203 /// The flush zero may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF` 1204 uint _MM_GET_FLUSH_ZERO_MODE() @safe 1205 { 1206 return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; 1207 } 1208 1209 /// Get the rounding mode bits from the MXCSR control and status register. The rounding mode may 1210 /// contain any of the following flags: `_MM_ROUND_NEAREST, `_MM_ROUND_DOWN`, `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`. 1211 uint _MM_GET_ROUNDING_MODE() @safe 1212 { 1213 return _mm_getcsr() & _MM_ROUND_MASK; 1214 } 1215 1216 /// Get the unsigned 32-bit value of the MXCSR control and status register. 1217 /// Note: this is emulated on ARM, because there is no MXCSR register then. 1218 uint _mm_getcsr() @trusted 1219 { 1220 static if (LDC_with_ARM) 1221 { 1222 // Note: we convert the ARM FPSCR into a x86 SSE control word. 1223 // However, only rounding mode and flush to zero are actually set. 1224 // The returned control word will have all exceptions masked, and no exception detected. 1225 1226 uint fpscr = arm_get_fpcr(); 1227 1228 uint cw = 0; // No exception detected 1229 if (fpscr & _MM_FLUSH_ZERO_MASK_ARM) 1230 { 1231 // ARM has one single flag for ARM. 1232 // It does both x86 bits. 1233 // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode 1234 cw |= _MM_FLUSH_ZERO_ON; 1235 cw |= 0x40; // set "denormals are zeros" 1236 } 1237 cw |= _MM_MASK_MASK; // All exception maske 1238 1239 // Rounding mode 1240 switch(fpscr & _MM_ROUND_MASK_ARM) 1241 { 1242 default: 1243 case _MM_ROUND_NEAREST_ARM: cw |= _MM_ROUND_NEAREST; break; 1244 case _MM_ROUND_DOWN_ARM: cw |= _MM_ROUND_DOWN; break; 1245 case _MM_ROUND_UP_ARM: cw |= _MM_ROUND_UP; break; 1246 case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO; break; 1247 } 1248 return cw; 1249 } 1250 else version(GNU) 1251 { 1252 static if (GDC_with_SSE) 1253 { 1254 return __builtin_ia32_stmxcsr(); 1255 } 1256 else version(X86) 1257 { 1258 uint sseRounding = 0; 1259 asm pure nothrow @nogc @trusted 1260 { 1261 "stmxcsr %0;\n" 1262 : "=m" (sseRounding) 1263 : 1264 : ; 1265 } 1266 return sseRounding; 1267 } 1268 else 1269 static assert(false); 1270 } 1271 else version (InlineX86Asm) 1272 { 1273 uint controlWord; 1274 asm nothrow @nogc pure @safe 1275 { 1276 stmxcsr controlWord; 1277 } 1278 return controlWord; 1279 } 1280 else 1281 static assert(0, "Not yet supported"); 1282 } 1283 unittest 1284 { 1285 uint csr = _mm_getcsr(); 1286 } 1287 1288 /// Insert a 16-bit integer `i` inside `a` at the location specified by `imm8`. 1289 __m64 _mm_insert_pi16 (__m64 v, int i, int imm8) pure @trusted 1290 { 1291 short4 r = cast(short4)v; 1292 r.ptr[imm8 & 3] = cast(short)i; 1293 return cast(__m64)r; 1294 } 1295 unittest 1296 { 1297 __m64 A = _mm_set_pi16(3, 2, 1, 0); 1298 short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4); 1299 short[4] correct = [0, 42, 2, 3]; 1300 assert(R.array == correct); 1301 } 1302 1303 /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory. 1304 // `p` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1305 __m128 _mm_load_ps(const(float)*p) pure @trusted // FUTURE shouldn't be trusted, see #62 1306 { 1307 pragma(inline, true); 1308 return *cast(__m128*)p; 1309 } 1310 unittest 1311 { 1312 static immutable align(16) float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f]; 1313 __m128 A = _mm_load_ps(correct.ptr); 1314 assert(A.array == correct); 1315 } 1316 1317 /// Load a single-precision (32-bit) floating-point element from memory into all elements. 1318 __m128 _mm_load_ps1(const(float)*p) pure @trusted 1319 { 1320 return __m128(*p); 1321 } 1322 unittest 1323 { 1324 float n = 2.5f; 1325 float[4] correct = [2.5f, 2.5f, 2.5f, 2.5f]; 1326 __m128 A = _mm_load_ps1(&n); 1327 assert(A.array == correct); 1328 } 1329 1330 /// Load a single-precision (32-bit) floating-point element from memory into the lower of dst, and zero the upper 3 1331 /// elements. `mem_addr` does not need to be aligned on any particular boundary. 1332 __m128 _mm_load_ss (const(float)* mem_addr) pure @trusted 1333 { 1334 pragma(inline, true); 1335 static if (DMD_with_DSIMD) 1336 { 1337 return cast(__m128)__simd(XMM.LODSS, *cast(__m128*)mem_addr); 1338 } 1339 else 1340 { 1341 __m128 r; // PERf =void; 1342 r.ptr[0] = *mem_addr; 1343 r.ptr[1] = 0; 1344 r.ptr[2] = 0; 1345 r.ptr[3] = 0; 1346 return r; 1347 } 1348 } 1349 unittest 1350 { 1351 float n = 2.5f; 1352 float[4] correct = [2.5f, 0.0f, 0.0f, 0.0f]; 1353 __m128 A = _mm_load_ss(&n); 1354 assert(A.array == correct); 1355 } 1356 1357 /// Load a single-precision (32-bit) floating-point element from memory into all elements. 1358 alias _mm_load1_ps = _mm_load_ps1; 1359 1360 /// Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of result, 1361 /// and copy the lower 2 elements from `a` to result. `mem_addr does` not need to be aligned on any particular boundary. 1362 __m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted 1363 { 1364 pragma(inline, true); 1365 static if (DMD_with_DSIMD) 1366 { 1367 return cast(__m128) __simd(XMM.LODHPS, a, *cast(const(__m128)*)mem_addr); 1368 } 1369 else 1370 { 1371 // x86: movlhps generated since LDC 1.9.0 -O1 1372 long2 la = cast(long2)a; 1373 la.ptr[1] = (*mem_addr).array[0]; 1374 return cast(__m128)la; 1375 } 1376 } 1377 unittest 1378 { 1379 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1380 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1381 __m64 M = to_m64(cast(__m128i)B); 1382 __m128 R = _mm_loadh_pi(A, &M); 1383 float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f]; 1384 assert(R.array == correct); 1385 } 1386 1387 /// Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of result, 1388 /// and copy the upper 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary. 1389 __m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted 1390 { 1391 pragma(inline, true); 1392 1393 // Disabled because of https://issues.dlang.org/show_bug.cgi?id=23046 1394 /* 1395 static if (DMD_with_DSIMD) 1396 { 1397 return cast(__m128) __simd(XMM.LODLPS, a, *cast(const(__m128)*)mem_addr); 1398 } 1399 else */ 1400 { 1401 // x86: movlpd/movlps generated with all LDC -01 1402 long2 la = cast(long2)a; 1403 la.ptr[0] = (*mem_addr).array[0]; 1404 return cast(__m128)la; 1405 } 1406 } 1407 unittest 1408 { 1409 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1410 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1411 __m64 M = to_m64(cast(__m128i)B); 1412 __m128 R = _mm_loadl_pi(A, &M); 1413 float[4] correct = [5.0f, 6.0f, 3.0f, 4.0f]; 1414 assert(R.array == correct); 1415 } 1416 1417 /// Load 4 single-precision (32-bit) floating-point elements from memory in reverse order. 1418 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1419 __m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted // FUTURE shouldn't be trusted, see #62 1420 { 1421 __m128* aligned = cast(__m128*)mem_addr; // x86: movaps + shups since LDC 1.0.0 -O1 1422 __m128 a = *aligned; 1423 static if (DMD_with_DSIMD) 1424 { 1425 return cast(__m128) __simd(XMM.SHUFPS, a, a, 27); 1426 } 1427 else 1428 { 1429 __m128 r; // PERF =void; 1430 r.ptr[0] = a.array[3]; 1431 r.ptr[1] = a.array[2]; 1432 r.ptr[2] = a.array[1]; 1433 r.ptr[3] = a.array[0]; 1434 return r; 1435 } 1436 } 1437 unittest 1438 { 1439 align(16) static immutable float[4] arr = [ 1.0f, 2.0f, 3.0f, 8.0f ]; 1440 __m128 A = _mm_loadr_ps(arr.ptr); 1441 float[4] correct = [ 8.0f, 3.0f, 2.0f, 1.0f ]; 1442 assert(A.array == correct); 1443 } 1444 1445 /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory. 1446 /// `mem_addr` does not need to be aligned on any particular boundary. 1447 __m128 _mm_loadu_ps(const(float)* mem_addr) pure @trusted 1448 { 1449 pragma(inline, true); 1450 static if (GDC_with_SSE2) 1451 { 1452 return __builtin_ia32_loadups(mem_addr); 1453 } 1454 else version(LDC) 1455 { 1456 return loadUnaligned!(__m128)(mem_addr); 1457 } 1458 else version(DigitalMars) 1459 { 1460 static if (DMD_with_DSIMD) 1461 { 1462 return cast(__m128)__simd(XMM.LODUPS, *cast(const(float4*))mem_addr); 1463 } 1464 else static if (SSESizedVectorsAreEmulated) 1465 { 1466 // Since this vector is emulated, it doesn't have alignement constraints 1467 // and as such we can just cast it. 1468 return *cast(__m128*)(mem_addr); 1469 } 1470 else 1471 { 1472 __m128 result; 1473 result.ptr[0] = mem_addr[0]; 1474 result.ptr[1] = mem_addr[1]; 1475 result.ptr[2] = mem_addr[2]; 1476 result.ptr[3] = mem_addr[3]; 1477 return result; 1478 } 1479 } 1480 else 1481 { 1482 __m128 result; 1483 result.ptr[0] = mem_addr[0]; 1484 result.ptr[1] = mem_addr[1]; 1485 result.ptr[2] = mem_addr[2]; 1486 result.ptr[3] = mem_addr[3]; 1487 return result; 1488 } 1489 } 1490 unittest 1491 { 1492 align(16) static immutable float[5] arr = [ 1.0f, 2.0f, 3.0f, 8.0f, 9.0f ]; // force unaligned load 1493 __m128 A = _mm_loadu_ps(&arr[1]); 1494 float[4] correct = [ 2.0f, 3.0f, 8.0f, 9.0f ]; 1495 assert(A.array == correct); 1496 } 1497 1498 /// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise. 1499 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted 1500 { 1501 static if (DMD_with_DSIMD) 1502 { 1503 int r = *cast(short*)(mem_addr); 1504 return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r); 1505 } 1506 else version(DigitalMars) 1507 { 1508 // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672 1509 // DMD cannot handle the below code... 1510 align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0]; 1511 r[0] = *cast(short*)(mem_addr); 1512 return *cast(int4*)(r.ptr); 1513 } 1514 else 1515 { 1516 short r = *cast(short*)(mem_addr); 1517 short8 result = [0, 0, 0, 0, 0, 0, 0, 0]; 1518 result.ptr[0] = r; 1519 return cast(__m128i)result; 1520 } 1521 } 1522 unittest 1523 { 1524 short r = 13; 1525 short8 A = cast(short8) _mm_loadu_si16(&r); 1526 short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0]; 1527 assert(A.array == correct); 1528 } 1529 1530 /// Load unaligned 64-bit integer from memory into the first element of result. 1531 /// Upper 64-bit is zeroed. 1532 __m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted 1533 { 1534 pragma(inline, true); 1535 static if (DMD_with_DSIMD) 1536 { 1537 return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr); 1538 } 1539 else 1540 { 1541 long r = *cast(long*)(mem_addr); 1542 long2 result = [0, 0]; 1543 result.ptr[0] = r; 1544 return cast(__m128i)result; 1545 } 1546 } 1547 unittest 1548 { 1549 long r = 446446446446; 1550 long2 A = cast(long2) _mm_loadu_si64(&r); 1551 long[2] correct = [446446446446, 0]; 1552 assert(A.array == correct); 1553 } 1554 1555 /// Allocate size bytes of memory, aligned to the alignment specified in align, 1556 /// and return a pointer to the allocated memory. `_mm_free` should be used to free 1557 /// memory that is allocated with `_mm_malloc`. 1558 void* _mm_malloc(size_t size, size_t alignment) @trusted 1559 { 1560 assert(alignment != 0); 1561 size_t request = requestedSize(size, alignment); 1562 void* raw = malloc(request); 1563 if (request > 0 && raw == null) // malloc(0) can validly return anything 1564 onOutOfMemoryError(); 1565 return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size 1566 } 1567 1568 /// Conditionally store 8-bit integer elements from a into memory using mask (elements are not stored when the highest 1569 /// bit is not set in the corresponding element) and a non-temporal memory hint. 1570 void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted 1571 { 1572 // this works since mask is zero-extended 1573 return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr); 1574 } 1575 1576 deprecated("Use _mm_maskmove_si64 instead") alias _m_maskmovq = _mm_maskmove_si64;/// 1577 1578 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum value. 1579 __m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe 1580 { 1581 return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b))); 1582 } 1583 1584 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed maximum values. 1585 __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe 1586 { 1587 static if (DMD_with_DSIMD) 1588 { 1589 return cast(__m128) __simd(XMM.MAXPS, a, b); 1590 } 1591 else static if (GDC_with_SSE) 1592 { 1593 return __builtin_ia32_maxps(a, b); 1594 } 1595 else static if (LDC_with_SSE) 1596 { 1597 return __builtin_ia32_maxps(a, b); 1598 } 1599 else 1600 { 1601 // ARM: Optimized into fcmgt + bsl since LDC 1.8 -02 1602 __m128 r; // PERF =void; 1603 r[0] = (a[0] > b[0]) ? a[0] : b[0]; 1604 r[1] = (a[1] > b[1]) ? a[1] : b[1]; 1605 r[2] = (a[2] > b[2]) ? a[2] : b[2]; 1606 r[3] = (a[3] > b[3]) ? a[3] : b[3]; 1607 return r; 1608 } 1609 } 1610 unittest 1611 { 1612 __m128 A = _mm_setr_ps(1, 2, float.nan, 4); 1613 __m128 B = _mm_setr_ps(4, 1, 4, float.nan); 1614 __m128 M = _mm_max_ps(A, B); 1615 assert(M.array[0] == 4); 1616 assert(M.array[1] == 2); 1617 assert(M.array[2] == 4); // in case of NaN, second operand prevails (as it seems) 1618 assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems) 1619 } 1620 1621 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values. 1622 __m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe 1623 { 1624 return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b))); 1625 } 1626 1627 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the maximum value in the 1628 /// lower element of result, and copy the upper 3 packed elements from `a` to the upper element of result. 1629 __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe 1630 { 1631 static if (DMD_with_DSIMD) 1632 { 1633 return cast(__m128) __simd(XMM.MAXSS, a, b); 1634 } 1635 else static if (GDC_with_SSE) 1636 { 1637 return __builtin_ia32_maxss(a, b); 1638 } 1639 else static if (LDC_with_SSE) 1640 { 1641 return __builtin_ia32_maxss(a, b); 1642 } 1643 else 1644 { 1645 __m128 r = a; 1646 r[0] = (a[0] > b[0]) ? a[0] : b[0]; 1647 return r; 1648 } 1649 } 1650 unittest 1651 { 1652 __m128 A = _mm_setr_ps(1, 2, 3, 4); 1653 __m128 B = _mm_setr_ps(4, 1, 4, 1); 1654 __m128 C = _mm_setr_ps(float.nan, 1, 4, 1); 1655 __m128 M = _mm_max_ss(A, B); 1656 assert(M.array[0] == 4); 1657 assert(M.array[1] == 2); 1658 assert(M.array[2] == 3); 1659 assert(M.array[3] == 4); 1660 M = _mm_max_ps(A, C); // in case of NaN, second operand prevails 1661 assert(M.array[0] != M.array[0]); 1662 M = _mm_max_ps(C, A); // in case of NaN, second operand prevails 1663 assert(M.array[0] == 1); 1664 } 1665 1666 /// Compare packed signed 16-bit integers in a and b, and return packed minimum values. 1667 __m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe 1668 { 1669 return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b))); 1670 } 1671 1672 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed maximum values. 1673 __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe 1674 { 1675 static if (DMD_with_DSIMD) 1676 { 1677 return cast(__m128) __simd(XMM.MINPS, a, b); 1678 } 1679 else static if (GDC_with_SSE) 1680 { 1681 return __builtin_ia32_minps(a, b); 1682 } 1683 else static if (LDC_with_SSE) 1684 { 1685 // not technically needed, but better perf in debug mode 1686 return __builtin_ia32_minps(a, b); 1687 } 1688 else 1689 { 1690 // ARM: Optimized into fcmgt + bsl since LDC 1.8 -02 1691 __m128 r; // PERF =void; 1692 r[0] = (a[0] < b[0]) ? a[0] : b[0]; 1693 r[1] = (a[1] < b[1]) ? a[1] : b[1]; 1694 r[2] = (a[2] < b[2]) ? a[2] : b[2]; 1695 r[3] = (a[3] < b[3]) ? a[3] : b[3]; 1696 return r; 1697 } 1698 } 1699 unittest 1700 { 1701 __m128 A = _mm_setr_ps(1, 2, float.nan, 4); 1702 __m128 B = _mm_setr_ps(4, 1, 4, float.nan); 1703 __m128 M = _mm_min_ps(A, B); 1704 assert(M.array[0] == 1); 1705 assert(M.array[1] == 1); 1706 assert(M.array[2] == 4); // in case of NaN, second operand prevails (as it seems) 1707 assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems) 1708 } 1709 1710 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values. 1711 __m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe 1712 { 1713 return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b))); 1714 } 1715 1716 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the minimum value in the 1717 /// lower element of result, and copy the upper 3 packed elements from `a` to the upper element of result. 1718 __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe 1719 { 1720 static if (DMD_with_DSIMD) 1721 { 1722 return cast(__m128) __simd(XMM.MINSS, a, b); 1723 } 1724 else static if (GDC_with_SSE) 1725 { 1726 return __builtin_ia32_minss(a, b); 1727 } 1728 else static if (LDC_with_SSE) 1729 { 1730 return __builtin_ia32_minss(a, b); 1731 } 1732 else 1733 { 1734 // Generates minss since LDC 1.3 -O1 1735 __m128 r = a; 1736 r[0] = (a[0] < b[0]) ? a[0] : b[0]; 1737 return r; 1738 } 1739 } 1740 unittest 1741 { 1742 __m128 A = _mm_setr_ps(1, 2, 3, 4); 1743 __m128 B = _mm_setr_ps(4, 1, 4, 1); 1744 __m128 C = _mm_setr_ps(float.nan, 1, 4, 1); 1745 __m128 M = _mm_min_ss(A, B); 1746 assert(M.array[0] == 1); 1747 assert(M.array[1] == 2); 1748 assert(M.array[2] == 3); 1749 assert(M.array[3] == 4); 1750 M = _mm_min_ps(A, C); // in case of NaN, second operand prevails 1751 assert(M.array[0] != M.array[0]); 1752 M = _mm_min_ps(C, A); // in case of NaN, second operand prevails 1753 assert(M.array[0] == 1); 1754 } 1755 1756 /// Move the lower single-precision (32-bit) floating-point element from `b` to the lower element of result, and copy 1757 /// the upper 3 packed elements from `a` to the upper elements of result. 1758 __m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted 1759 { 1760 // Workaround https://issues.dlang.org/show_bug.cgi?id=21673 1761 // inlining of this function fails. 1762 version(DigitalMars) asm nothrow @nogc pure { nop; } 1763 1764 a.ptr[0] = b.array[0]; 1765 return a; 1766 } 1767 unittest 1768 { 1769 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1770 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1771 __m128 R = _mm_move_ss(A, B); 1772 float[4] correct = [5.0f, 2.0f, 3.0f, 4.0f]; 1773 assert(R.array == correct); 1774 } 1775 1776 /// Move the upper 2 single-precision (32-bit) floating-point elements from `b` to the lower 2 elements of result, and 1777 /// copy the upper 2 elements from `a` to the upper 2 elements of dst. 1778 __m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted 1779 { 1780 // PERF DMD 1781 // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443 1782 /* 1783 static if (DMD_with_DSIMD) 1784 { 1785 1786 return cast(__m128) __simd(XMM.MOVHLPS, a, b); 1787 } 1788 else */ 1789 { 1790 a.ptr[0] = b.array[2]; 1791 a.ptr[1] = b.array[3]; 1792 return a; 1793 } 1794 } 1795 unittest 1796 { 1797 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1798 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1799 __m128 R = _mm_movehl_ps(A, B); 1800 float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f]; 1801 assert(R.array == correct); 1802 } 1803 1804 /// Move the lower 2 single-precision (32-bit) floating-point elements from `b` to the upper 2 elements of result, and 1805 /// copy the lower 2 elements from `a` to the lower 2 elements of result 1806 __m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted 1807 { 1808 // PERF DMD 1809 // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443 1810 /* 1811 static if (DMD_with_DSIMD) 1812 { 1813 return cast(__m128) __simd(XMM.MOVLHPS, a, b); 1814 } 1815 else 1816 */ 1817 { 1818 a.ptr[2] = b.array[0]; 1819 a.ptr[3] = b.array[1]; 1820 return a; 1821 } 1822 } 1823 unittest 1824 { 1825 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1826 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1827 __m128 R = _mm_movelh_ps(A, B); 1828 float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f]; 1829 assert(R.array == correct); 1830 } 1831 1832 /// Create mask from the most significant bit of each 8-bit element in `a`. 1833 int _mm_movemask_pi8 (__m64 a) pure @safe 1834 { 1835 return _mm_movemask_epi8(to_m128i(a)); 1836 } 1837 unittest 1838 { 1839 assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0))); 1840 } 1841 1842 /// Set each bit of result based on the most significant bit of the corresponding packed single-precision (32-bit) 1843 /// floating-point element in `a`. 1844 int _mm_movemask_ps (__m128 a) pure @trusted 1845 { 1846 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 1847 static if (GDC_with_SSE) 1848 { 1849 return __builtin_ia32_movmskps(a); 1850 } 1851 else static if (LDC_with_SSE) 1852 { 1853 return __builtin_ia32_movmskps(a); 1854 } 1855 else static if (LDC_with_ARM) 1856 { 1857 int4 ai = cast(int4)a; 1858 int4 shift31 = [31, 31, 31, 31]; 1859 ai = ai >>> shift31; 1860 int4 shift = [0, 1, 2, 3]; 1861 ai = ai << shift; // 4-way shift, only efficient on ARM. 1862 int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]); 1863 return r; 1864 } 1865 else 1866 { 1867 int4 ai = cast(int4)a; 1868 int r = 0; 1869 if (ai.array[0] < 0) r += 1; 1870 if (ai.array[1] < 0) r += 2; 1871 if (ai.array[2] < 0) r += 4; 1872 if (ai.array[3] < 0) r += 8; 1873 return r; 1874 } 1875 } 1876 unittest 1877 { 1878 int4 A = [-1, 0, -43, 0]; 1879 assert(5 == _mm_movemask_ps(cast(float4)A)); 1880 } 1881 1882 /// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`. 1883 __m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe 1884 { 1885 pragma(inline, true); 1886 return a * b; 1887 } 1888 unittest 1889 { 1890 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1891 a = _mm_mul_ps(a, a); 1892 float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f]; 1893 assert(a.array == correct); 1894 } 1895 1896 /// Multiply the lower single-precision (32-bit) floating-point element in `a` and `b`, store the result in the lower 1897 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result. 1898 __m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe 1899 { 1900 static if (DMD_with_DSIMD) 1901 return cast(__m128) __simd(XMM.MULSS, a, b); 1902 else static if (GDC_with_SSE) 1903 return __builtin_ia32_mulss(a, b); 1904 else 1905 { 1906 a[0] *= b[0]; 1907 return a; 1908 } 1909 } 1910 unittest 1911 { 1912 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1913 a = _mm_mul_ss(a, a); 1914 float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f]; 1915 assert(a.array == correct); 1916 } 1917 1918 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, 1919 /// and return the high 16 bits of the intermediate integers. 1920 __m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe 1921 { 1922 return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b))); 1923 } 1924 unittest 1925 { 1926 __m64 A = _mm_setr_pi16(0, -16, 2, 3); 1927 __m64 B = _mm_set1_pi16(16384); 1928 short4 R = cast(short4)_mm_mulhi_pu16(A, B); 1929 short[4] correct = [0, 0x3FFC, 0, 0]; 1930 assert(R.array == correct); 1931 } 1932 1933 /// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`, and 1934 /// return the result. 1935 __m128 _mm_or_ps (__m128 a, __m128 b) pure @safe 1936 { 1937 static if (DMD_with_DSIMD) 1938 return cast(__m128)__simd(XMM.ORPS, a, b); 1939 else 1940 return cast(__m128)(cast(__m128i)a | cast(__m128i)b); 1941 } 1942 unittest 1943 { 1944 __m128 A = cast(__m128) _mm_set1_epi32(0x80000000); 1945 __m128 B = _mm_setr_ps(4.0f, -5.0, -9.5f, float.infinity); 1946 __m128 C = _mm_or_ps(A, B); 1947 float[4] correct = [-4.0f, -5.0, -9.5f, -float.infinity]; 1948 assert(C.array == correct); 1949 } 1950 1951 deprecated("Use _mm_avg_pu8 instead") alias _m_pavgb = _mm_avg_pu8;/// 1952 deprecated("Use _mm_avg_pu16 instead") alias _m_pavgw = _mm_avg_pu16;/// 1953 deprecated("Use _mm_extract_pi16 instead") alias _m_pextrw = _mm_extract_pi16;/// 1954 deprecated("Use _mm_insert_pi16 instead") alias _m_pinsrw = _mm_insert_pi16;/// 1955 deprecated("Use _mm_max_pi16 instead") alias _m_pmaxsw = _mm_max_pi16;/// 1956 deprecated("Use _mm_max_pu8 instead") alias _m_pmaxub = _mm_max_pu8;/// 1957 deprecated("Use _mm_min_pi16 instead") alias _m_pminsw = _mm_min_pi16;/// 1958 deprecated("Use _mm_min_pu8 instead") alias _m_pminub = _mm_min_pu8;/// 1959 deprecated("Use _mm_movemask_pi8 instead") alias _m_pmovmskb = _mm_movemask_pi8;/// 1960 deprecated("Use _mm_mulhi_pu16 instead") alias _m_pmulhuw = _mm_mulhi_pu16;/// 1961 1962 enum _MM_HINT_T0 = 3; /// 1963 enum _MM_HINT_T1 = 2; /// 1964 enum _MM_HINT_T2 = 1; /// 1965 enum _MM_HINT_NTA = 0; /// 1966 1967 1968 version(LDC) 1969 { 1970 // Starting with LLVM 10, it seems llvm.prefetch has changed its name. 1971 // Was reported at: https://github.com/ldc-developers/ldc/issues/3397 1972 static if (__VERSION__ >= 2091) 1973 { 1974 pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch" 1975 void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe; 1976 } 1977 } 1978 1979 /// Fetch the line of data from memory that contains address `p` to a location in the 1980 /// cache hierarchy specified by the locality hint i. 1981 /// 1982 /// Warning: `locality` is a compile-time parameter, unlike in Intel Intrinsics API. 1983 void _mm_prefetch(int locality)(const(void)* p) pure @trusted 1984 { 1985 static if (GDC_with_SSE) 1986 { 1987 return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3); 1988 } 1989 else static if (DMD_with_DSIMD) 1990 { 1991 enum bool isWrite = (locality & 0x4) != 0; 1992 enum level = locality & 3; 1993 return prefetch!(isWrite, level)(p); 1994 } 1995 else version(LDC) 1996 { 1997 static if (__VERSION__ >= 2091) 1998 { 1999 // const_cast here. `llvm_prefetch` wants a mutable pointer 2000 llvm_prefetch_fixed( cast(void*)p, 0, locality, 1); 2001 } 2002 else 2003 { 2004 // const_cast here. `llvm_prefetch` wants a mutable pointer 2005 llvm_prefetch( cast(void*)p, 0, locality, 1); 2006 } 2007 } 2008 else version(D_InlineAsm_X86_64) 2009 { 2010 static if (locality == _MM_HINT_NTA) 2011 { 2012 asm pure nothrow @nogc @trusted 2013 { 2014 mov RAX, p; 2015 prefetchnta [RAX]; 2016 } 2017 } 2018 else static if (locality == _MM_HINT_T0) 2019 { 2020 asm pure nothrow @nogc @trusted 2021 { 2022 mov RAX, p; 2023 prefetcht0 [RAX]; 2024 } 2025 } 2026 else static if (locality == _MM_HINT_T1) 2027 { 2028 asm pure nothrow @nogc @trusted 2029 { 2030 mov RAX, p; 2031 prefetcht1 [RAX]; 2032 } 2033 } 2034 else static if (locality == _MM_HINT_T2) 2035 { 2036 asm pure nothrow @nogc @trusted 2037 { 2038 mov RAX, p; 2039 prefetcht2 [RAX]; 2040 } 2041 } 2042 else 2043 assert(false); // invalid locality hint 2044 } 2045 else version(D_InlineAsm_X86) 2046 { 2047 static if (locality == _MM_HINT_NTA) 2048 { 2049 asm pure nothrow @nogc @trusted 2050 { 2051 mov EAX, p; 2052 prefetchnta [EAX]; 2053 } 2054 } 2055 else static if (locality == _MM_HINT_T0) 2056 { 2057 asm pure nothrow @nogc @trusted 2058 { 2059 mov EAX, p; 2060 prefetcht0 [EAX]; 2061 } 2062 } 2063 else static if (locality == _MM_HINT_T1) 2064 { 2065 asm pure nothrow @nogc @trusted 2066 { 2067 mov EAX, p; 2068 prefetcht1 [EAX]; 2069 } 2070 } 2071 else static if (locality == _MM_HINT_T2) 2072 { 2073 asm pure nothrow @nogc @trusted 2074 { 2075 mov EAX, p; 2076 prefetcht2 [EAX]; 2077 } 2078 } 2079 else 2080 assert(false); // invalid locality hint 2081 } 2082 else 2083 { 2084 // Generic version: do nothing. From bitter experience, 2085 // it's unlikely you get ANY speed-up with manual prefetching. 2086 // Prefetching or not doesn't change program behaviour. 2087 } 2088 } 2089 unittest 2090 { 2091 // From Intel documentation: 2092 // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of 2093 // 32 bytes." 2094 ubyte[256] cacheline; // though it seems it cannot generate GP fault 2095 _mm_prefetch!_MM_HINT_T0(cacheline.ptr); 2096 _mm_prefetch!_MM_HINT_T1(cacheline.ptr); 2097 _mm_prefetch!_MM_HINT_T2(cacheline.ptr); 2098 _mm_prefetch!_MM_HINT_NTA(cacheline.ptr); 2099 } 2100 2101 deprecated("Use _mm_sad_pu8 instead") alias _m_psadbw = _mm_sad_pu8;/// 2102 deprecated("Use _mm_shuffle_pi16 instead") alias _m_pshufw = _mm_shuffle_pi16;/// 2103 2104 2105 /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a`` , 2106 /// and return the results. The maximum relative error for this approximation is less than 1.5*2^-12. 2107 __m128 _mm_rcp_ps (__m128 a) pure @trusted 2108 { 2109 static if (DMD_with_DSIMD) 2110 { 2111 return cast(__m128) __simd(XMM.RCPPS, a); 2112 } 2113 else static if (GDC_with_SSE) 2114 { 2115 return __builtin_ia32_rcpps(a); 2116 } 2117 else static if (LDC_with_SSE) 2118 { 2119 return __builtin_ia32_rcpps(a); 2120 } 2121 else 2122 { 2123 a.ptr[0] = 1.0f / a.array[0]; 2124 a.ptr[1] = 1.0f / a.array[1]; 2125 a.ptr[2] = 1.0f / a.array[2]; 2126 a.ptr[3] = 1.0f / a.array[3]; 2127 return a; 2128 } 2129 } 2130 unittest 2131 { 2132 __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f); 2133 __m128 groundTruth = _mm_set1_ps(1.0f) / A; 2134 __m128 result = _mm_rcp_ps(A); 2135 foreach(i; 0..4) 2136 { 2137 double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; 2138 assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 2139 } 2140 } 2141 2142 /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in `a`, store it 2143 /// in the lower element of the result, and copy the upper 3 packed elements from `a` to the upper elements of result. 2144 /// The maximum relative error for this approximation is less than 1.5*2^-12. 2145 __m128 _mm_rcp_ss (__m128 a) pure @trusted 2146 { 2147 // Disabled, see https://issues.dlang.org/show_bug.cgi?id=23049 2148 /*static if (DMD_with_DSIMD) 2149 { 2150 return cast(__m128) __simd(XMM.RCPSS, a); 2151 } 2152 else*/ 2153 static if (GDC_with_SSE) 2154 { 2155 return __builtin_ia32_rcpss(a); 2156 } 2157 else static if (LDC_with_SSE) 2158 { 2159 return __builtin_ia32_rcpss(a); 2160 } 2161 else 2162 { 2163 a.ptr[0] = 1.0f / a.array[0]; 2164 return a; 2165 } 2166 } 2167 unittest 2168 { 2169 __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f); 2170 __m128 correct = _mm_setr_ps(1 / 2.34f, -70000.0f, 0.00001f, 345.5f); 2171 __m128 R = _mm_rcp_ss(A); 2172 double relError = (cast(double)(correct.array[0]) / R.array[0]) - 1; 2173 assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 2174 assert(R.array[1] == correct.array[1]); 2175 assert(R.array[2] == correct.array[2]); 2176 assert(R.array[3] == correct.array[3]); 2177 } 2178 2179 /// Reallocate `size` bytes of memory, aligned to the alignment specified in `alignment`, 2180 /// and return a pointer to the newly allocated memory. 2181 /// `_mm_free` or `alignedRealloc` with size 0 should be used to free memory that is 2182 /// allocated with `_mm_malloc` or `_mm_realloc`. 2183 /// Previous data is preserved. 2184 void* _mm_realloc(void* aligned, size_t size, size_t alignment) nothrow @nogc // #BONUS 2185 { 2186 return alignedReallocImpl!true(aligned, size, alignment); 2187 } 2188 unittest 2189 { 2190 enum NALLOC = 8; 2191 enum size_t[8] ALIGNMENTS = [1, 2, 4, 8, 16, 32, 64, 128]; 2192 2193 void*[NALLOC] alloc; 2194 2195 foreach(t; 0..100) 2196 { 2197 foreach(n; 0..NALLOC) 2198 { 2199 size_t alignment = ALIGNMENTS[n]; 2200 size_t s = ( (n + t * 69096) & 0xffff ); 2201 alloc[n] = _mm_realloc(alloc[n], s, alignment); 2202 assert(isPointerAligned(alloc[n], alignment)); 2203 foreach(b; 0..s) 2204 (cast(ubyte*)alloc[n])[b] = cast(ubyte)n; 2205 } 2206 } 2207 foreach(n; 0..NALLOC) 2208 { 2209 alloc[n] = _mm_realloc(alloc[n], 0, ALIGNMENTS[n]); 2210 } 2211 } 2212 2213 /// Reallocate `size` bytes of memory, aligned to the alignment specified in `alignment`, 2214 /// and return a pointer to the newly allocated memory. 2215 /// `_mm_free` or `alignedRealloc` with size 0 should be used to free memory that is 2216 /// allocated with `_mm_malloc` or `_mm_realloc`. 2217 /// Previous data is discarded. 2218 void* _mm_realloc_discard(void* aligned, size_t size, size_t alignment) nothrow @nogc // #BONUS 2219 { 2220 return alignedReallocImpl!false(aligned, size, alignment); 2221 } 2222 2223 /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in `a`. 2224 /// The maximum relative error for this approximation is less than 1.5*2^-12. 2225 __m128 _mm_rsqrt_ps (__m128 a) pure @trusted 2226 { 2227 static if (DMD_with_DSIMD) 2228 { 2229 return cast(__m128) __simd(XMM.RSQRTPS, a); 2230 } 2231 else static if (GDC_with_SSE) 2232 { 2233 return __builtin_ia32_rsqrtps(a); 2234 } 2235 else static if (LDC_with_SSE) 2236 { 2237 return __builtin_ia32_rsqrtps(a); 2238 } 2239 else version(LDC) 2240 { 2241 a[0] = 1.0f / llvm_sqrt(a[0]); 2242 a[1] = 1.0f / llvm_sqrt(a[1]); 2243 a[2] = 1.0f / llvm_sqrt(a[2]); 2244 a[3] = 1.0f / llvm_sqrt(a[3]); 2245 return a; 2246 } 2247 else 2248 { 2249 a.ptr[0] = 1.0f / sqrt(a.array[0]); 2250 a.ptr[1] = 1.0f / sqrt(a.array[1]); 2251 a.ptr[2] = 1.0f / sqrt(a.array[2]); 2252 a.ptr[3] = 1.0f / sqrt(a.array[3]); 2253 return a; 2254 } 2255 } 2256 unittest 2257 { 2258 __m128 A = _mm_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f); 2259 __m128 groundTruth = _mm_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f); 2260 __m128 result = _mm_rsqrt_ps(A); 2261 foreach(i; 0..4) 2262 { 2263 double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; 2264 assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 2265 } 2266 } 2267 2268 /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in `a`, 2269 /// store the result in the lower element. Copy the upper 3 packed elements from `a` to the upper elements of result. 2270 /// The maximum relative error for this approximation is less than 1.5*2^-12. 2271 __m128 _mm_rsqrt_ss (__m128 a) pure @trusted 2272 { 2273 static if (DMD_with_DSIMD) 2274 { 2275 return cast(__m128) __simd(XMM.RSQRTSS, a); 2276 } 2277 else static if (GDC_with_SSE) 2278 { 2279 return __builtin_ia32_rsqrtss(a); 2280 } 2281 else static if (LDC_with_SSE) 2282 { 2283 return __builtin_ia32_rsqrtss(a); 2284 } 2285 else version(LDC) 2286 { 2287 a[0] = 1.0f / llvm_sqrt(a[0]); 2288 return a; 2289 } 2290 else 2291 { 2292 a[0] = 1.0f / sqrt(a[0]); 2293 return a; 2294 } 2295 } 2296 unittest // this one test 4 different intrinsics: _mm_rsqrt_ss, _mm_rsqrt_ps, _mm_rcp_ps, _mm_rcp_ss 2297 { 2298 double maxRelativeError = 0.000245; // -72 dB, stuff is apparently more precise than said in the doc? 2299 void testApproximateSSE(float number) nothrow @nogc 2300 { 2301 __m128 A = _mm_set1_ps(number); 2302 2303 // test _mm_rcp_ps 2304 __m128 B = _mm_rcp_ps(A); 2305 foreach(i; 0..4) 2306 { 2307 double exact = 1.0f / A.array[i]; 2308 double ratio = cast(double)(B.array[i]) / cast(double)(exact); 2309 assert(abs_double(ratio - 1) <= maxRelativeError); 2310 } 2311 2312 // test _mm_rcp_ss 2313 { 2314 B = _mm_rcp_ss(A); 2315 double exact = 1.0f / A.array[0]; 2316 double ratio = cast(double)(B.array[0]) / cast(double)(exact); 2317 assert(abs_double(ratio - 1) <= maxRelativeError); 2318 } 2319 2320 // test _mm_rsqrt_ps 2321 B = _mm_rsqrt_ps(A); 2322 foreach(i; 0..4) 2323 { 2324 double exact = 1.0f / sqrt(A.array[i]); 2325 double ratio = cast(double)(B.array[i]) / cast(double)(exact); 2326 assert(abs_double(ratio - 1) <= maxRelativeError); 2327 } 2328 2329 // test _mm_rsqrt_ss 2330 { 2331 B = _mm_rsqrt_ss(A); 2332 double exact = 1.0f / sqrt(A.array[0]); 2333 double ratio = cast(double)(B.array[0]) / cast(double)(exact); 2334 assert(abs_double(ratio - 1) <= maxRelativeError); 2335 } 2336 } 2337 2338 testApproximateSSE(0.00001f); 2339 testApproximateSSE(1.1f); 2340 testApproximateSSE(345.0f); 2341 testApproximateSSE(2.45674864151f); 2342 testApproximateSSE(700000.0f); 2343 testApproximateSSE(10000000.0f); 2344 testApproximateSSE(27841456468.0f); 2345 } 2346 2347 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 2348 /// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 2349 /// low 16 bits of result. 2350 __m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe 2351 { 2352 return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b))); 2353 } 2354 2355 /// Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer 2356 /// `_MM_MASK_xxxx`. The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, `_MM_MASK_DIV_ZERO`, 2357 /// `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`. 2358 void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe 2359 { 2360 // Note: unsupported on ARM 2361 _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx); 2362 } 2363 2364 /// Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer 2365 /// `_MM_EXCEPT_xxxx`. The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 2366 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`. 2367 void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe 2368 { 2369 // Note: unsupported on ARM 2370 _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx); 2371 } 2372 2373 /// Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer 2374 /// `_MM_FLUSH_xxxx`. The flush zero may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`. 2375 void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe 2376 { 2377 _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx); 2378 } 2379 2380 /// Set packed single-precision (32-bit) floating-point elements with the supplied values. 2381 __m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted 2382 { 2383 __m128 r = void; 2384 r.ptr[0] = e0; 2385 r.ptr[1] = e1; 2386 r.ptr[2] = e2; 2387 r.ptr[3] = e3; 2388 return r; 2389 } 2390 unittest 2391 { 2392 __m128 A = _mm_set_ps(3, 2, 1, 546); 2393 float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f]; 2394 assert(A.array == correct); 2395 } 2396 2397 deprecated("Use _mm_set1_ps instead") alias _mm_set_ps1 = _mm_set1_ps; /// 2398 2399 /// Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer 2400 /// `_MM_ROUND_xxxx`. The rounding mode may contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, 2401 /// `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`. 2402 void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe 2403 { 2404 // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607 2405 version(GNU) asm nothrow @nogc @trusted { "" : : : "memory"; } 2406 _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx); 2407 } 2408 2409 /// Copy single-precision (32-bit) floating-point element `a` to the lower element of result, and zero the upper 3 elements. 2410 __m128 _mm_set_ss (float a) pure @trusted 2411 { 2412 static if (DMD_with_DSIMD) 2413 { 2414 return cast(__m128) __simd(XMM.LODSS, a); 2415 } 2416 else 2417 { 2418 __m128 r = _mm_setzero_ps(); 2419 r.ptr[0] = a; 2420 return r; 2421 } 2422 } 2423 unittest 2424 { 2425 float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f]; 2426 __m128 A = _mm_set_ss(42.0f); 2427 assert(A.array == correct); 2428 } 2429 2430 /// Broadcast single-precision (32-bit) floating-point value `a` to all elements. 2431 __m128 _mm_set1_ps (float a) pure @trusted 2432 { 2433 pragma(inline, true); 2434 __m128 r = a; 2435 return r; 2436 } 2437 unittest 2438 { 2439 float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f]; 2440 __m128 A = _mm_set1_ps(42.0f); 2441 assert(A.array == correct); 2442 } 2443 2444 /// Set the MXCSR control and status register with the value in unsigned 32-bit integer `controlWord`. 2445 void _mm_setcsr(uint controlWord) @trusted 2446 { 2447 static if (LDC_with_ARM) 2448 { 2449 // Convert from SSE to ARM control word. This is done _partially_ 2450 // and only support rounding mode changes. 2451 2452 // "To alter some bits of a VFP system register without 2453 // affecting other bits, use a read-modify-write procedure" 2454 uint fpscr = arm_get_fpcr(); 2455 2456 // Bits 23 to 22 are rounding modes, however not used in NEON 2457 fpscr = fpscr & ~_MM_ROUND_MASK_ARM; 2458 switch(controlWord & _MM_ROUND_MASK) 2459 { 2460 default: 2461 case _MM_ROUND_NEAREST: fpscr |= _MM_ROUND_NEAREST_ARM; break; 2462 case _MM_ROUND_DOWN: fpscr |= _MM_ROUND_DOWN_ARM; break; 2463 case _MM_ROUND_UP: fpscr |= _MM_ROUND_UP_ARM; break; 2464 case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break; 2465 } 2466 fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM; 2467 if (controlWord & _MM_FLUSH_ZERO_MASK) 2468 fpscr |= _MM_FLUSH_ZERO_MASK_ARM; 2469 arm_set_fpcr(fpscr); 2470 } 2471 else version(GNU) 2472 { 2473 static if (GDC_with_SSE) 2474 { 2475 // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607 2476 version(GNU) asm nothrow @nogc @trusted { "" : : : "memory"; } 2477 __builtin_ia32_ldmxcsr(controlWord); 2478 } 2479 else version(X86) 2480 { 2481 asm nothrow @nogc @trusted 2482 { 2483 "ldmxcsr %0;\n" 2484 : 2485 : "m" (controlWord) 2486 : ; 2487 } 2488 } 2489 else 2490 static assert(false); 2491 } 2492 else version (InlineX86Asm) 2493 { 2494 asm nothrow @nogc @safe 2495 { 2496 ldmxcsr controlWord; 2497 } 2498 } 2499 else 2500 static assert(0, "Not yet supported"); 2501 } 2502 unittest 2503 { 2504 _mm_setcsr(_mm_getcsr()); 2505 } 2506 2507 /// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order. 2508 __m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted 2509 { 2510 pragma(inline, true); 2511 2512 // This small = void here wins a bit in all optimization levels in GDC 2513 // and in -O0 in LDC. 2514 __m128 r = void; 2515 r.ptr[0] = e3; 2516 r.ptr[1] = e2; 2517 r.ptr[2] = e1; 2518 r.ptr[3] = e0; 2519 return r; 2520 } 2521 unittest 2522 { 2523 __m128 A = _mm_setr_ps(3, 2, 1, 546); 2524 float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f]; 2525 assert(A.array == correct); 2526 } 2527 2528 /// Return vector of type `__m128` with all elements set to zero. 2529 __m128 _mm_setzero_ps() pure @trusted 2530 { 2531 pragma(inline, true); 2532 2533 // Note: for all compilers, this works best in debug builds, and in DMD -O 2534 int4 r; 2535 return cast(__m128)r; 2536 } 2537 unittest 2538 { 2539 __m128 R = _mm_setzero_ps(); 2540 float[4] correct = [0.0f, 0, 0, 0]; 2541 assert(R.array == correct); 2542 } 2543 2544 /// Do a serializing operation on all store-to-memory instructions that were issued prior 2545 /// to this instruction. Guarantees that every store instruction that precedes, in program order, 2546 /// is globally visible before any store instruction which follows the fence in program order. 2547 void _mm_sfence() @trusted 2548 { 2549 version(GNU) 2550 { 2551 static if (GDC_with_SSE) 2552 { 2553 __builtin_ia32_sfence(); 2554 } 2555 else version(X86) 2556 { 2557 asm pure nothrow @nogc @trusted 2558 { 2559 "sfence;\n" : : : ; 2560 } 2561 } 2562 else 2563 static assert(false); 2564 } 2565 else static if (LDC_with_SSE) 2566 { 2567 __builtin_ia32_sfence(); 2568 } 2569 else static if (DMD_with_asm) 2570 { 2571 // PERF: can't be inlined in DMD, probably because of that assembly. 2572 asm nothrow @nogc pure @safe 2573 { 2574 sfence; 2575 } 2576 } 2577 else static if (LDC_with_ARM64) 2578 { 2579 __builtin_arm_dmb(10); // dmb ishst 2580 } 2581 else version(LDC) 2582 { 2583 // When the architecture is unknown, generate a full memory barrier, 2584 // as the semantics of sfence do not really match those of atomics. 2585 llvm_memory_fence(); 2586 } 2587 else 2588 static assert(false); 2589 } 2590 unittest 2591 { 2592 _mm_sfence(); 2593 } 2594 2595 2596 __m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @trusted 2597 { 2598 // PERF DMD + D_SIMD 2599 version(LDC) 2600 { 2601 return cast(__m64) shufflevectorLDC!(short4, ( (imm8 >> 0) & 3 ), 2602 ( (imm8 >> 2) & 3 ), 2603 ( (imm8 >> 4) & 3 ), 2604 ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a); 2605 } 2606 else 2607 { 2608 // GDC optimizes that correctly starting with -O2 2609 short4 sa = cast(short4)a; 2610 short4 r = void; 2611 r.ptr[0] = sa.array[ (imm8 >> 0) & 3 ]; 2612 r.ptr[1] = sa.array[ (imm8 >> 2) & 3 ]; 2613 r.ptr[2] = sa.array[ (imm8 >> 4) & 3 ]; 2614 r.ptr[3] = sa.array[ (imm8 >> 6) & 3 ]; 2615 return cast(__m64)r; 2616 } 2617 } 2618 unittest 2619 { 2620 __m64 A = _mm_setr_pi16(0, 1, 2, 3); 2621 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 2622 short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A); 2623 short[4] expectedB = [ 3, 2, 1, 0 ]; 2624 assert(B.array == expectedB); 2625 } 2626 2627 /// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` using the control in `imm8`, 2628 /// Warning: the immediate shuffle value `imm` is given at compile-time instead of runtime. 2629 __m128 _mm_shuffle_ps(ubyte imm8)(__m128 a, __m128 b) pure @trusted 2630 { 2631 static if (GDC_with_SSE) 2632 { 2633 return __builtin_ia32_shufps(a, b, imm8); 2634 } 2635 else static if (DMD_with_DSIMD) 2636 { 2637 return cast(__m128) __simd(XMM.SHUFPS, a, b, imm8); 2638 } 2639 else version(LDC) 2640 { 2641 return shufflevectorLDC!(__m128, imm8 & 3, (imm8>>2) & 3, 2642 4 + ((imm8>>4) & 3), 4 + ((imm8>>6) & 3) )(a, b); 2643 } 2644 else 2645 { 2646 float4 r = void; 2647 r.ptr[0] = a.array[ (imm8 >> 0) & 3 ]; 2648 r.ptr[1] = a.array[ (imm8 >> 2) & 3 ]; 2649 r.ptr[2] = b.array[ (imm8 >> 4) & 3 ]; 2650 r.ptr[3] = b.array[ (imm8 >> 6) & 3 ]; 2651 return r; 2652 } 2653 } 2654 unittest 2655 { 2656 __m128 A = _mm_setr_ps(0, 1, 2, 3); 2657 __m128 B = _mm_setr_ps(4, 5, 6, 7); 2658 __m128 C = _mm_shuffle_ps!0x9c(A, B); 2659 float[4] correct = [0.0f, 3, 5, 6]; 2660 assert(C.array == correct); 2661 } 2662 2663 /// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`. 2664 __m128 _mm_sqrt_ps(__m128 a) @trusted 2665 { 2666 static if (GDC_with_SSE) 2667 { 2668 return __builtin_ia32_sqrtps(a); 2669 } 2670 else static if (DMD_with_DSIMD) 2671 { 2672 return cast(__m128) __simd(XMM.SQRTPS, a); 2673 } 2674 else version(LDC) 2675 { 2676 // Disappeared with LDC 1.11 2677 static if (__VERSION__ < 2081) 2678 return __builtin_ia32_sqrtps(a); 2679 else 2680 { 2681 // PERF: use llvm_sqrt on the vector, works better 2682 a[0] = llvm_sqrt(a[0]); 2683 a[1] = llvm_sqrt(a[1]); 2684 a[2] = llvm_sqrt(a[2]); 2685 a[3] = llvm_sqrt(a[3]); 2686 return a; 2687 } 2688 } 2689 else 2690 { 2691 a.ptr[0] = sqrt(a.array[0]); 2692 a.ptr[1] = sqrt(a.array[1]); 2693 a.ptr[2] = sqrt(a.array[2]); 2694 a.ptr[3] = sqrt(a.array[3]); 2695 return a; 2696 } 2697 } 2698 unittest 2699 { 2700 __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f)); 2701 assert(A.array[0] == 2.0f); 2702 assert(A.array[1] == 2.0f); 2703 assert(A.array[2] == 2.0f); 2704 assert(A.array[3] == 2.0f); 2705 } 2706 2707 /// Compute the square root of the lower single-precision (32-bit) floating-point element in `a`, store it in the lower 2708 /// element, and copy the upper 3 packed elements from `a` to the upper elements of result. 2709 __m128 _mm_sqrt_ss(__m128 a) @trusted 2710 { 2711 static if (GDC_with_SSE) 2712 { 2713 return __builtin_ia32_sqrtss(a); 2714 } 2715 // PERF DMD 2716 // TODO: enable when https://issues.dlang.org/show_bug.cgi?id=23437 is fixed for good 2717 /*else static if (DMD_with_DSIMD) 2718 { 2719 return cast(__m128) __simd(XMM.SQRTSS, a); 2720 }*/ 2721 else version(LDC) 2722 { 2723 a.ptr[0] = llvm_sqrt(a.array[0]); 2724 return a; 2725 } 2726 else 2727 { 2728 a.ptr[0] = sqrt(a.array[0]); 2729 return a; 2730 } 2731 } 2732 unittest 2733 { 2734 __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f)); 2735 assert(A.array[0] == 2.0f); 2736 assert(A.array[1] == 4.0f); 2737 assert(A.array[2] == 4.0f); 2738 assert(A.array[3] == 4.0f); 2739 } 2740 2741 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 2742 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 2743 void _mm_store_ps (float* mem_addr, __m128 a) pure 2744 { 2745 pragma(inline, true); 2746 __m128* aligned = cast(__m128*)mem_addr; 2747 *aligned = a; 2748 } 2749 2750 deprecated("Use _mm_store1_ps instead") alias _mm_store_ps1 = _mm_store1_ps; /// 2751 2752 /// Store the lower single-precision (32-bit) floating-point element from `a` into memory. 2753 /// `mem_addr` does not need to be aligned on any particular boundary. 2754 void _mm_store_ss (float* mem_addr, __m128 a) pure @safe 2755 { 2756 pragma(inline, true); 2757 *mem_addr = a.array[0]; 2758 } 2759 unittest 2760 { 2761 float a; 2762 _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546)); 2763 assert(a == 546); 2764 } 2765 2766 /// Store the lower single-precision (32-bit) floating-point element from `a` into 4 contiguous elements in memory. 2767 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 2768 void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE: shouldn't be trusted, see #62 2769 { 2770 __m128* aligned = cast(__m128*)mem_addr; 2771 static if (DMD_with_DSIMD) 2772 { 2773 __m128 r = cast(__m128) __simd(XMM.SHUFPS, a, a, 0); 2774 } 2775 else 2776 { 2777 __m128 r; // PERF =void; 2778 r.ptr[0] = a.array[0]; 2779 r.ptr[1] = a.array[0]; 2780 r.ptr[2] = a.array[0]; 2781 r.ptr[3] = a.array[0]; 2782 } 2783 *aligned = r; 2784 } 2785 unittest 2786 { 2787 align(16) float[4] A; 2788 _mm_store1_ps(A.ptr, _mm_set_ss(42.0f)); 2789 float[4] correct = [42.0f, 42, 42, 42]; 2790 assert(A == correct); 2791 } 2792 2793 /// Store the upper 2 single-precision (32-bit) floating-point elements from `a` into memory. 2794 void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted 2795 { 2796 pragma(inline, true); 2797 long2 la = cast(long2)a; 2798 (*p).ptr[0] = la.array[1]; 2799 } 2800 unittest 2801 { 2802 __m64 R = _mm_setzero_si64(); 2803 long2 A = [13, 25]; 2804 _mm_storeh_pi(&R, cast(__m128)A); 2805 assert(R.array[0] == 25); 2806 } 2807 2808 /// Store the lower 2 single-precision (32-bit) floating-point elements from `a` into memory. 2809 void _mm_storel_pi(__m64* p, __m128 a) pure @trusted 2810 { 2811 pragma(inline, true); 2812 long2 la = cast(long2)a; 2813 (*p).ptr[0] = la.array[0]; 2814 } 2815 unittest 2816 { 2817 __m64 R = _mm_setzero_si64(); 2818 long2 A = [13, 25]; 2819 _mm_storel_pi(&R, cast(__m128)A); 2820 assert(R.array[0] == 13); 2821 } 2822 2823 /// Store 4 single-precision (32-bit) floating-point elements from `a` into memory in reverse order. 2824 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 2825 void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE should not be trusted 2826 { 2827 __m128* aligned = cast(__m128*)mem_addr; 2828 static if (DMD_with_DSIMD) 2829 { 2830 __m128 r = cast(__m128) __simd(XMM.SHUFPS, a, a, 27); 2831 } 2832 else 2833 { 2834 __m128 r; // PERF =void; 2835 r.ptr[0] = a.array[3]; 2836 r.ptr[1] = a.array[2]; 2837 r.ptr[2] = a.array[1]; 2838 r.ptr[3] = a.array[0]; 2839 } 2840 *aligned = r; 2841 } 2842 unittest 2843 { 2844 align(16) float[4] A; 2845 _mm_storer_ps(A.ptr, _mm_setr_ps(1.0f, 2, 3, 4)); 2846 float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f]; 2847 assert(A == correct); 2848 } 2849 2850 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 2851 /// `mem_addr` does not need to be aligned on any particular boundary. 2852 void _mm_storeu_ps(float* mem_addr, __m128 a) pure @trusted // FUTURE should not be trusted, see #62 2853 { 2854 pragma(inline, true); 2855 static if (DMD_with_DSIMD) 2856 { 2857 cast(void) __simd_sto(XMM.STOUPS, *cast(void16*)(cast(float*)mem_addr), a); 2858 } 2859 else static if (GDC_with_SSE) 2860 { 2861 __builtin_ia32_storeups(mem_addr, a); // better in -O0 2862 } 2863 else version(LDC) 2864 { 2865 storeUnaligned!(float4)(a, mem_addr); 2866 } 2867 else 2868 { 2869 mem_addr[0] = a.array[0]; 2870 mem_addr[1] = a.array[1]; 2871 mem_addr[2] = a.array[2]; 2872 mem_addr[3] = a.array[3]; 2873 } 2874 } 2875 unittest 2876 { 2877 __m128 A = _mm_setr_ps(1.0f, 2, 3, 4); 2878 align(16) float[6] R = [0.0f, 0, 0, 0, 0, 0]; 2879 float[4] correct = [1.0f, 2, 3, 4]; 2880 _mm_storeu_ps(&R[1], A); 2881 assert(R[1..5] == correct); 2882 } 2883 2884 /// Store 64-bits of integer data from `a` into memory using a non-temporal memory hint. 2885 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 2886 void _mm_stream_pi (__m64* mem_addr, __m64 a) pure @trusted 2887 { 2888 _mm_stream_si64(cast(long*)mem_addr, a.array[0]); 2889 } 2890 2891 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 2892 /// `a`s into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte 2893 /// boundary or a general-protection exception may be generated. 2894 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 2895 void _mm_stream_ps (float* mem_addr, __m128 a) 2896 { 2897 // PERF DMD D_SIMD 2898 static if (GDC_with_SSE) 2899 { 2900 return __builtin_ia32_movntps(mem_addr, a); 2901 } 2902 else version(LDC) 2903 { 2904 enum prefix = `!0 = !{ i32 1 }`; 2905 enum ir = ` 2906 store <4 x float> %1, <4 x float>* %0, align 16, !nontemporal !0 2907 ret void`; 2908 LDCInlineIREx!(prefix, ir, "", void, __m128*, float4)(cast(__m128*)mem_addr, a); 2909 2910 } 2911 else 2912 { 2913 // Regular store instead. 2914 __m128* dest = cast(__m128*)mem_addr; 2915 *dest = a; // it's a regular move instead 2916 } 2917 } 2918 unittest 2919 { 2920 align(16) float[4] A; 2921 _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f)); 2922 assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f); 2923 } 2924 2925 /// Subtract packed single-precision (32-bit) floating-point elements in `b` from packed single-precision (32-bit) 2926 /// floating-point elements in `a`. 2927 __m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe 2928 { 2929 pragma(inline, true); 2930 return a - b; 2931 } 2932 unittest 2933 { 2934 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 2935 a = _mm_sub_ps(a, a); 2936 float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f]; 2937 assert(a.array == correct); 2938 } 2939 2940 /// Subtract the lower single-precision (32-bit) floating-point element in `b` from the lower single-precision (32-bit) 2941 /// floating-point element in `a`, store the subtration result in the lower element of result, and copy the upper 3 2942 /// packed elements from a to the upper elements of result. 2943 __m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe 2944 { 2945 static if (DMD_with_DSIMD) 2946 return cast(__m128) __simd(XMM.SUBSS, a, b); 2947 else static if (GDC_with_SSE) 2948 return __builtin_ia32_subss(a, b); 2949 else 2950 { 2951 a[0] -= b[0]; 2952 return a; 2953 } 2954 } 2955 unittest 2956 { 2957 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 2958 a = _mm_sub_ss(a, a); 2959 float[4] correct = [0.0f, -2.0, 3.0f, 1.0f]; 2960 assert(a.array == correct); 2961 } 2962 2963 /// Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in row0, row1, 2964 /// row2, and row3, and store the transposed matrix in these vectors (row0 now contains column 0, etc.). 2965 void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe 2966 { 2967 __m128 tmp3, tmp2, tmp1, tmp0; 2968 tmp0 = _mm_unpacklo_ps(row0, row1); 2969 tmp2 = _mm_unpacklo_ps(row2, row3); 2970 tmp1 = _mm_unpackhi_ps(row0, row1); 2971 tmp3 = _mm_unpackhi_ps(row2, row3); 2972 row0 = _mm_movelh_ps(tmp0, tmp2); 2973 row1 = _mm_movehl_ps(tmp2, tmp0); 2974 row2 = _mm_movelh_ps(tmp1, tmp3); 2975 row3 = _mm_movehl_ps(tmp3, tmp1); 2976 } 2977 unittest 2978 { 2979 __m128 l0 = _mm_setr_ps(0, 1, 2, 3); 2980 __m128 l1 = _mm_setr_ps(4, 5, 6, 7); 2981 __m128 l2 = _mm_setr_ps(8, 9, 10, 11); 2982 __m128 l3 = _mm_setr_ps(12, 13, 14, 15); 2983 _MM_TRANSPOSE4_PS(l0, l1, l2, l3); 2984 float[4] r0 = [0.0f, 4, 8, 12]; 2985 float[4] r1 = [1.0f, 5, 9, 13]; 2986 float[4] r2 = [2.0f, 6, 10, 14]; 2987 float[4] r3 = [3.0f, 7, 11, 15]; 2988 assert(l0.array == r0); 2989 assert(l1.array == r1); 2990 assert(l2.array == r2); 2991 assert(l3.array == r3); 2992 } 2993 2994 // Note: the only difference between these intrinsics is the signalling 2995 // behaviour of quiet NaNs. This is incorrect but the case where 2996 // you would want to differentiate between qNaN and sNaN and then 2997 // treat them differently on purpose seems extremely rare. 2998 alias _mm_ucomieq_ss = _mm_comieq_ss; 2999 alias _mm_ucomige_ss = _mm_comige_ss; 3000 alias _mm_ucomigt_ss = _mm_comigt_ss; 3001 alias _mm_ucomile_ss = _mm_comile_ss; 3002 alias _mm_ucomilt_ss = _mm_comilt_ss; 3003 alias _mm_ucomineq_ss = _mm_comineq_ss; 3004 3005 /// Return vector of type `__m128` with undefined elements. 3006 __m128 _mm_undefined_ps() pure @safe 3007 { 3008 pragma(inline, true); 3009 __m128 undef = void; 3010 return undef; 3011 } 3012 3013 /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half `a` and `b`. 3014 __m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted 3015 { 3016 // PERF GDC use intrinsic 3017 static if (DMD_with_DSIMD) 3018 { 3019 return cast(__m128) __simd(XMM.UNPCKHPS, a, b); 3020 } 3021 else version(LDC) 3022 { 3023 return shufflevectorLDC!(__m128, 2, 6, 3, 7)(a, b); 3024 } 3025 else 3026 { 3027 __m128 r; // PERF =void; 3028 r.ptr[0] = a.array[2]; 3029 r.ptr[1] = b.array[2]; 3030 r.ptr[2] = a.array[3]; 3031 r.ptr[3] = b.array[3]; 3032 return r; 3033 } 3034 } 3035 unittest 3036 { 3037 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 3038 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 3039 __m128 R = _mm_unpackhi_ps(A, B); 3040 float[4] correct = [3.0f, 7.0f, 4.0f, 8.0f]; 3041 assert(R.array == correct); 3042 } 3043 3044 /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of `a` and `b`. 3045 __m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted 3046 { 3047 // PERF GDC use intrinsic 3048 static if (DMD_with_DSIMD) 3049 { 3050 return cast(__m128) __simd(XMM.UNPCKLPS, a, b); 3051 } 3052 else version(LDC) 3053 { 3054 return shufflevectorLDC!(__m128, 0, 4, 1, 5)(a, b); 3055 } 3056 else 3057 { 3058 __m128 r; // PERF =void; 3059 r.ptr[0] = a.array[0]; 3060 r.ptr[1] = b.array[0]; 3061 r.ptr[2] = a.array[1]; 3062 r.ptr[3] = b.array[1]; 3063 return r; 3064 } 3065 } 3066 unittest 3067 { 3068 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 3069 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 3070 __m128 R = _mm_unpacklo_ps(A, B); 3071 float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f]; 3072 assert(R.array == correct); 3073 } 3074 3075 /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`. 3076 __m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe 3077 { 3078 return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b); 3079 } 3080 unittest 3081 { 3082 __m128 A = cast(__m128) _mm_set1_epi32(0x80000000); 3083 __m128 B = _mm_setr_ps(4.0f, -5.0, -9.5f, float.infinity); 3084 __m128 C = _mm_xor_ps(A, B); 3085 float[4] correct = [-4.0f, 5.0, 9.5f, -float.infinity]; 3086 assert(C.array == correct); 3087 } 3088 3089 private 3090 { 3091 // Returns: `true` if the pointer is suitably aligned. 3092 bool isPointerAligned(void* p, size_t alignment) pure 3093 { 3094 assert(alignment != 0); 3095 return ( cast(size_t)p & (alignment - 1) ) == 0; 3096 } 3097 3098 // Returns: next pointer aligned with alignment bytes. 3099 void* nextAlignedPointer(void* start, size_t alignment) pure 3100 { 3101 return cast(void*)nextMultipleOf(cast(size_t)(start), alignment); 3102 } 3103 3104 // Returns number of bytes to actually allocate when asking 3105 // for a particular alignment 3106 @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure 3107 { 3108 enum size_t pointerSize = size_t.sizeof; 3109 return askedSize + alignment - 1 + pointerSize * 3; 3110 } 3111 3112 // Store pointer given by malloc + size + alignment 3113 @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure 3114 { 3115 enum size_t pointerSize = size_t.sizeof; 3116 char* start = cast(char*)raw + pointerSize * 3; 3117 void* aligned = nextAlignedPointer(start, alignment); 3118 void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize); 3119 *rawLocation = raw; 3120 size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize); 3121 *sizeLocation = size; 3122 size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize); 3123 *alignmentLocation = alignment; 3124 assert( isPointerAligned(aligned, alignment) ); 3125 return aligned; 3126 } 3127 3128 // Returns: x, multiple of powerOfTwo, so that x >= n. 3129 @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow 3130 { 3131 // check power-of-two 3132 assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0)); 3133 3134 size_t mask = ~(powerOfTwo - 1); 3135 return (n + powerOfTwo - 1) & mask; 3136 } 3137 3138 void* alignedReallocImpl(bool PreserveDataIfResized)(void* aligned, size_t size, size_t alignment) 3139 { 3140 if (aligned is null) 3141 return _mm_malloc(size, alignment); 3142 3143 assert(alignment != 0); 3144 assert(isPointerAligned(aligned, alignment)); 3145 3146 size_t previousSize = *cast(size_t*)(cast(char*)aligned - size_t.sizeof * 2); 3147 size_t prevAlignment = *cast(size_t*)(cast(char*)aligned - size_t.sizeof * 3); 3148 3149 // It is illegal to change the alignment across calls. 3150 assert(prevAlignment == alignment); 3151 3152 void* raw = *cast(void**)(cast(char*)aligned - size_t.sizeof); 3153 size_t request = requestedSize(size, alignment); 3154 size_t previousRequest = requestedSize(previousSize, alignment); 3155 assert(previousRequest - request == previousSize - size); 3156 3157 // Heuristic: if a requested size is within 50% to 100% of what is already allocated 3158 // then exit with the same pointer 3159 // PERF it seems like `realloc` should do that, not us. 3160 if ( (previousRequest < request * 4) && (request <= previousRequest) ) 3161 return aligned; 3162 3163 void* newRaw = malloc(request); 3164 if (request > 0 && newRaw == null) // realloc(0) can validly return anything 3165 onOutOfMemoryError(); 3166 3167 void* newAligned = storeRawPointerPlusInfo(newRaw, size, alignment); 3168 3169 static if (PreserveDataIfResized) 3170 { 3171 size_t minSize = size < previousSize ? size : previousSize; 3172 memcpy(newAligned, aligned, minSize); // ok to use memcpy: newAligned is into new memory, always different from aligned 3173 } 3174 3175 // Free previous data 3176 _mm_free(aligned); 3177 assert(isPointerAligned(newAligned, alignment)); 3178 return newAligned; 3179 } 3180 } 3181 3182 unittest 3183 { 3184 assert(nextMultipleOf(0, 4) == 0); 3185 assert(nextMultipleOf(1, 4) == 4); 3186 assert(nextMultipleOf(2, 4) == 4); 3187 assert(nextMultipleOf(3, 4) == 4); 3188 assert(nextMultipleOf(4, 4) == 4); 3189 assert(nextMultipleOf(5, 4) == 8); 3190 3191 { 3192 void* p = _mm_malloc(23, 16); 3193 assert(p !is null); 3194 assert(((cast(size_t)p) & 0xf) == 0); 3195 _mm_free(p); 3196 } 3197 3198 void* nullAlloc = _mm_malloc(0, 32); 3199 assert(nullAlloc != null); 3200 _mm_free(nullAlloc); 3201 } 3202 3203 // For some reason, order of declaration is important for this one 3204 // so it is misplaced. 3205 // Note: is just another name for _mm_cvtss_si32 3206 alias _mm_cvt_ss2si = _mm_cvtss_si32;