1 /** 2 * SSE intrinsics. 3 * 4 * Copyright: Copyright Guillaume Piolat 2016-2020. 5 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 6 */ 7 module inteli.xmmintrin; 8 9 public import inteli.types; 10 11 import inteli.internals; 12 13 import inteli.mmx; 14 import inteli.emmintrin; 15 16 import core.stdc.stdlib: malloc, free; 17 import core.stdc.string: memcpy; 18 import core.exception: onOutOfMemoryError; 19 20 version(D_InlineAsm_X86) 21 version = InlineX86Asm; 22 else version(D_InlineAsm_X86_64) 23 version = InlineX86Asm; 24 25 26 // SSE1 27 28 nothrow @nogc: 29 30 31 enum int _MM_EXCEPT_INVALID = 0x0001; /// MXCSR Exception states. 32 enum int _MM_EXCEPT_DENORM = 0x0002; ///ditto 33 enum int _MM_EXCEPT_DIV_ZERO = 0x0004; ///ditto 34 enum int _MM_EXCEPT_OVERFLOW = 0x0008; ///ditto 35 enum int _MM_EXCEPT_UNDERFLOW = 0x0010; ///ditto 36 enum int _MM_EXCEPT_INEXACT = 0x0020; ///ditto 37 enum int _MM_EXCEPT_MASK = 0x003f; /// MXCSR Exception states mask. 38 39 enum int _MM_MASK_INVALID = 0x0080; /// MXCSR Exception masks. 40 enum int _MM_MASK_DENORM = 0x0100; ///ditto 41 enum int _MM_MASK_DIV_ZERO = 0x0200; ///ditto 42 enum int _MM_MASK_OVERFLOW = 0x0400; ///ditto 43 enum int _MM_MASK_UNDERFLOW = 0x0800; ///ditto 44 enum int _MM_MASK_INEXACT = 0x1000; ///ditto 45 enum int _MM_MASK_MASK = 0x1f80; /// MXCSR Exception masks mask. 46 47 enum int _MM_ROUND_NEAREST = 0x0000; /// MXCSR Rounding mode. 48 enum int _MM_ROUND_DOWN = 0x2000; ///ditto 49 enum int _MM_ROUND_UP = 0x4000; ///ditto 50 enum int _MM_ROUND_TOWARD_ZERO = 0x6000; ///ditto 51 enum int _MM_ROUND_MASK = 0x6000; /// MXCSR Rounding mode mask. 52 53 enum int _MM_FLUSH_ZERO_MASK = 0x8000; /// MXCSR Denormal flush to zero mask. 54 enum int _MM_FLUSH_ZERO_ON = 0x8000; /// MXCSR Denormal flush to zero modes. 55 enum int _MM_FLUSH_ZERO_OFF = 0x0000; ///ditto 56 57 /// Add packed single-precision (32-bit) floating-point elements in `a` and `b`. 58 __m128 _mm_add_ps(__m128 a, __m128 b) pure @safe 59 { 60 return a + b; 61 } 62 unittest 63 { 64 __m128 a = [1, 2, 3, 4]; 65 a = _mm_add_ps(a, a); 66 assert(a.array[0] == 2); 67 assert(a.array[1] == 4); 68 assert(a.array[2] == 6); 69 assert(a.array[3] == 8); 70 } 71 72 /// Add the lower single-precision (32-bit) floating-point element 73 /// in `a` and `b`, store the result in the lower element of result, 74 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 75 __m128 _mm_add_ss(__m128 a, __m128 b) pure @safe 76 { 77 static if (GDC_with_SSE) 78 { 79 return __builtin_ia32_addss(a, b); 80 } 81 else static if (DMD_with_DSIMD) 82 { 83 return cast(__m128) __simd(XMM.ADDSS, a, b); 84 } 85 else 86 { 87 a[0] += b[0]; 88 return a; 89 } 90 } 91 unittest 92 { 93 __m128 a = [1, 2, 3, 4]; 94 a = _mm_add_ss(a, a); 95 assert(a.array == [2.0f, 2, 3, 4]); 96 } 97 98 /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`. 99 __m128 _mm_and_ps (__m128 a, __m128 b) pure @safe 100 { 101 return cast(__m128)(cast(__m128i)a & cast(__m128i)b); 102 } 103 unittest 104 { 105 float a = 4.32f; 106 float b = -78.99f; 107 int correct = (*cast(int*)(&a)) & (*cast(int*)(&b)); 108 __m128 A = _mm_set_ps(a, b, a, b); 109 __m128 B = _mm_set_ps(b, a, b, a); 110 int4 R = cast(int4)( _mm_and_ps(A, B) ); 111 assert(R.array[0] == correct); 112 assert(R.array[1] == correct); 113 assert(R.array[2] == correct); 114 assert(R.array[3] == correct); 115 } 116 117 /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`. 118 __m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe 119 { 120 static if (DMD_with_DSIMD) 121 return cast(__m128) __simd(XMM.ANDNPS, a, b); 122 else 123 return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b ); 124 } 125 unittest 126 { 127 float a = 4.32f; 128 float b = -78.99f; 129 int correct = ~(*cast(int*)(&a)) & (*cast(int*)(&b)); 130 int correct2 = (*cast(int*)(&a)) & ~(*cast(int*)(&b)); 131 __m128 A = _mm_set_ps(a, b, a, b); 132 __m128 B = _mm_set_ps(b, a, b, a); 133 int4 R = cast(int4)( _mm_andnot_ps(A, B) ); 134 assert(R.array[0] == correct2); 135 assert(R.array[1] == correct); 136 assert(R.array[2] == correct2); 137 assert(R.array[3] == correct); 138 } 139 140 /// Average packed unsigned 16-bit integers in ``a` and `b`. 141 __m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe 142 { 143 return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b))); 144 } 145 146 /// Average packed unsigned 8-bit integers in ``a` and `b`. 147 __m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe 148 { 149 return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b))); 150 } 151 152 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for equality. 153 __m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe 154 { 155 static if (DMD_with_DSIMD) 156 return cast(__m128) __simd(XMM.CMPPS, a, b, 0); 157 else 158 return cast(__m128) cmpps!(FPComparison.oeq)(a, b); 159 } 160 unittest 161 { 162 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 163 __m128 B = _mm_setr_ps(3.0f, 2.0f, float.nan, float.nan); 164 __m128i R = cast(__m128i) _mm_cmpeq_ps(A, B); 165 int[4] correct = [0, -1, 0, 0]; 166 assert(R.array == correct); 167 } 168 169 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for equality, 170 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 171 __m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe 172 { 173 static if (DMD_with_DSIMD) 174 return cast(__m128) __simd(XMM.CMPSS, a, b, 0); 175 else 176 return cast(__m128) cmpss!(FPComparison.oeq)(a, b); 177 } 178 unittest 179 { 180 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 181 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 182 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 183 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 184 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 185 __m128i R1 = cast(__m128i) _mm_cmpeq_ss(A, B); 186 __m128i R2 = cast(__m128i) _mm_cmpeq_ss(A, C); 187 __m128i R3 = cast(__m128i) _mm_cmpeq_ss(A, D); 188 __m128i R4 = cast(__m128i) _mm_cmpeq_ss(A, E); 189 int[4] correct1 = [-1, 0, 0, 0]; 190 int[4] correct2 = [0, 0, 0, 0]; 191 int[4] correct3 = [0, 0, 0, 0]; 192 int[4] correct4 = [0, 0, 0, 0]; 193 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 194 } 195 196 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal. 197 __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe 198 { 199 static if (DMD_with_DSIMD) 200 return cast(__m128) __simd(XMM.CMPPS, b, a, 2); 201 else 202 return cast(__m128) cmpps!(FPComparison.oge)(a, b); 203 } 204 unittest 205 { 206 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 207 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 208 __m128i R = cast(__m128i) _mm_cmpge_ps(A, B); 209 int[4] correct = [0, -1,-1, 0]; 210 assert(R.array == correct); 211 } 212 213 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal, 214 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 215 __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe 216 { 217 static if (DMD_with_DSIMD) 218 { 219 __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 2); 220 a[0] = c[0]; 221 return a; 222 } 223 else 224 return cast(__m128) cmpss!(FPComparison.oge)(a, b); 225 } 226 unittest 227 { 228 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 229 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 230 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 231 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 232 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 233 __m128i R1 = cast(__m128i) _mm_cmpge_ss(A, B); 234 __m128i R2 = cast(__m128i) _mm_cmpge_ss(A, C); 235 __m128i R3 = cast(__m128i) _mm_cmpge_ss(A, D); 236 __m128i R4 = cast(__m128i) _mm_cmpge_ss(A, E); 237 int[4] correct1 = [-1, 0, 0, 0]; 238 int[4] correct2 = [-1, 0, 0, 0]; 239 int[4] correct3 = [0, 0, 0, 0]; 240 int[4] correct4 = [0, 0, 0, 0]; 241 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 242 } 243 244 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than. 245 __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe 246 { 247 static if (DMD_with_DSIMD) 248 return cast(__m128) __simd(XMM.CMPPS, b, a, 1); 249 else 250 return cast(__m128) cmpps!(FPComparison.ogt)(a, b); 251 } 252 unittest 253 { 254 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 255 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 256 __m128i R = cast(__m128i) _mm_cmpgt_ps(A, B); 257 int[4] correct = [0, 0,-1, 0]; 258 assert(R.array == correct); 259 } 260 261 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than, 262 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 263 __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe 264 { 265 static if (DMD_with_DSIMD) 266 { 267 __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 1); 268 a[0] = c[0]; 269 return a; 270 } 271 else 272 return cast(__m128) cmpss!(FPComparison.ogt)(a, b); 273 } 274 unittest 275 { 276 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 277 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 278 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 279 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 280 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 281 __m128i R1 = cast(__m128i) _mm_cmpgt_ss(A, B); 282 __m128i R2 = cast(__m128i) _mm_cmpgt_ss(A, C); 283 __m128i R3 = cast(__m128i) _mm_cmpgt_ss(A, D); 284 __m128i R4 = cast(__m128i) _mm_cmpgt_ss(A, E); 285 int[4] correct1 = [0, 0, 0, 0]; 286 int[4] correct2 = [-1, 0, 0, 0]; 287 int[4] correct3 = [0, 0, 0, 0]; 288 int[4] correct4 = [0, 0, 0, 0]; 289 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 290 } 291 292 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal. 293 __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe 294 { 295 static if (DMD_with_DSIMD) 296 return cast(__m128) __simd(XMM.CMPPS, a, b, 2); 297 else 298 return cast(__m128) cmpps!(FPComparison.ole)(a, b); 299 } 300 unittest 301 { 302 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 303 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 304 __m128i R = cast(__m128i) _mm_cmple_ps(A, B); 305 int[4] correct = [-1, -1, 0, 0]; 306 assert(R.array == correct); 307 } 308 309 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal, 310 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 311 __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe 312 { 313 static if (DMD_with_DSIMD) 314 return cast(__m128) __simd(XMM.CMPSS, a, b, 2); 315 else 316 return cast(__m128) cmpss!(FPComparison.ole)(a, b); 317 } 318 unittest 319 { 320 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 321 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 322 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 323 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 324 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 325 __m128i R1 = cast(__m128i) _mm_cmple_ss(A, B); 326 __m128i R2 = cast(__m128i) _mm_cmple_ss(A, C); 327 __m128i R3 = cast(__m128i) _mm_cmple_ss(A, D); 328 __m128i R4 = cast(__m128i) _mm_cmple_ss(A, E); 329 int[4] correct1 = [-1, 0, 0, 0]; 330 int[4] correct2 = [0, 0, 0, 0]; 331 int[4] correct3 = [0, 0, 0, 0]; 332 int[4] correct4 = [-1, 0, 0, 0]; 333 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 334 } 335 336 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than. 337 __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe 338 { 339 static if (DMD_with_DSIMD) 340 return cast(__m128) __simd(XMM.CMPPS, a, b, 1); 341 else 342 return cast(__m128) cmpps!(FPComparison.olt)(a, b); 343 } 344 unittest 345 { 346 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 347 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 348 __m128i R = cast(__m128i) _mm_cmplt_ps(A, B); 349 int[4] correct = [-1, 0, 0, 0]; 350 assert(R.array == correct); 351 } 352 353 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than, 354 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 355 __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe 356 { 357 static if (DMD_with_DSIMD) 358 return cast(__m128) __simd(XMM.CMPSS, a, b, 1); 359 else 360 return cast(__m128) cmpss!(FPComparison.olt)(a, b); 361 } 362 unittest 363 { 364 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 365 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 366 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 367 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 368 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 369 __m128i R1 = cast(__m128i) _mm_cmplt_ss(A, B); 370 __m128i R2 = cast(__m128i) _mm_cmplt_ss(A, C); 371 __m128i R3 = cast(__m128i) _mm_cmplt_ss(A, D); 372 __m128i R4 = cast(__m128i) _mm_cmplt_ss(A, E); 373 int[4] correct1 = [0, 0, 0, 0]; 374 int[4] correct2 = [0, 0, 0, 0]; 375 int[4] correct3 = [0, 0, 0, 0]; 376 int[4] correct4 = [-1, 0, 0, 0]; 377 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 378 } 379 380 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-equal. 381 __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe 382 { 383 static if (DMD_with_DSIMD) 384 return cast(__m128) __simd(XMM.CMPPS, a, b, 4); 385 else 386 return cast(__m128) cmpps!(FPComparison.une)(a, b); 387 } 388 unittest 389 { 390 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 391 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 392 __m128i R = cast(__m128i) _mm_cmpneq_ps(A, B); 393 int[4] correct = [-1, 0, -1, -1]; 394 assert(R.array == correct); 395 } 396 397 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-equal, 398 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 399 __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe 400 { 401 static if (DMD_with_DSIMD) 402 return cast(__m128) __simd(XMM.CMPSS, a, b, 4); 403 else 404 return cast(__m128) cmpss!(FPComparison.une)(a, b); 405 } 406 unittest 407 { 408 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 409 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 410 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 411 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 412 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 413 __m128i R1 = cast(__m128i) _mm_cmpneq_ss(A, B); 414 __m128i R2 = cast(__m128i) _mm_cmpneq_ss(A, C); 415 __m128i R3 = cast(__m128i) _mm_cmpneq_ss(A, D); 416 __m128i R4 = cast(__m128i) _mm_cmpneq_ss(A, E); 417 int[4] correct1 = [0, 0, 0, 0]; 418 int[4] correct2 = [-1, 0, 0, 0]; 419 int[4] correct3 = [-1, 0, 0, 0]; 420 int[4] correct4 = [-1, 0, 0, 0]; 421 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 422 } 423 424 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal. 425 __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe 426 { 427 static if (DMD_with_DSIMD) 428 return cast(__m128) __simd(XMM.CMPPS, b, a, 6); 429 else 430 return cast(__m128) cmpps!(FPComparison.ult)(a, b); 431 } 432 unittest 433 { 434 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 435 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 436 __m128i R = cast(__m128i) _mm_cmpnge_ps(A, B); 437 int[4] correct = [-1, 0, 0, -1]; 438 assert(R.array == correct); 439 } 440 441 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal, 442 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 443 __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe 444 { 445 static if (DMD_with_DSIMD) 446 { 447 __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 6); 448 a[0] = c[0]; 449 return a; 450 } 451 else 452 return cast(__m128) cmpss!(FPComparison.ult)(a, b); 453 } 454 unittest 455 { 456 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 457 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 458 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 459 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 460 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 461 __m128i R1 = cast(__m128i) _mm_cmpnge_ss(A, B); 462 __m128i R2 = cast(__m128i) _mm_cmpnge_ss(A, C); 463 __m128i R3 = cast(__m128i) _mm_cmpnge_ss(A, D); 464 __m128i R4 = cast(__m128i) _mm_cmpnge_ss(A, E); 465 int[4] correct1 = [0, 0, 0, 0]; 466 int[4] correct2 = [0, 0, 0, 0]; 467 int[4] correct3 = [-1, 0, 0, 0]; 468 int[4] correct4 = [-1, 0, 0, 0]; 469 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 470 } 471 472 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than. 473 __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe 474 { 475 static if (DMD_with_DSIMD) 476 return cast(__m128) __simd(XMM.CMPPS, b, a, 5); 477 else 478 return cast(__m128) cmpps!(FPComparison.ule)(a, b); 479 } 480 unittest 481 { 482 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 483 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 484 __m128i R = cast(__m128i) _mm_cmpngt_ps(A, B); 485 int[4] correct = [-1, -1, 0, -1]; 486 assert(R.array == correct); 487 } 488 489 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than, 490 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 491 __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe 492 { 493 static if (DMD_with_DSIMD) 494 { 495 __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 5); 496 a[0] = c[0]; 497 return a; 498 } 499 else 500 return cast(__m128) cmpss!(FPComparison.ule)(a, b); 501 } 502 unittest 503 { 504 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 505 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 506 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 507 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 508 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 509 __m128i R1 = cast(__m128i) _mm_cmpngt_ss(A, B); 510 __m128i R2 = cast(__m128i) _mm_cmpngt_ss(A, C); 511 __m128i R3 = cast(__m128i) _mm_cmpngt_ss(A, D); 512 __m128i R4 = cast(__m128i) _mm_cmpngt_ss(A, E); 513 int[4] correct1 = [-1, 0, 0, 0]; 514 int[4] correct2 = [0, 0, 0, 0]; 515 int[4] correct3 = [-1, 0, 0, 0]; 516 int[4] correct4 = [-1, 0, 0, 0]; 517 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 518 } 519 520 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal. 521 __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe 522 { 523 static if (DMD_with_DSIMD) 524 return cast(__m128) __simd(XMM.CMPPS, a, b, 6); 525 else 526 return cast(__m128) cmpps!(FPComparison.ugt)(a, b); 527 } 528 unittest 529 { 530 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 531 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 532 __m128i R = cast(__m128i) _mm_cmpnle_ps(A, B); 533 int[4] correct = [0, 0, -1, -1]; 534 assert(R.array == correct); 535 } 536 537 538 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal, 539 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 540 __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe 541 { 542 static if (DMD_with_DSIMD) 543 return cast(__m128) __simd(XMM.CMPSS, a, b, 6); 544 else 545 return cast(__m128) cmpss!(FPComparison.ugt)(a, b); 546 } 547 unittest 548 { 549 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 550 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 551 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 552 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 553 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 554 __m128i R1 = cast(__m128i) _mm_cmpnle_ss(A, B); 555 __m128i R2 = cast(__m128i) _mm_cmpnle_ss(A, C); 556 __m128i R3 = cast(__m128i) _mm_cmpnle_ss(A, D); 557 __m128i R4 = cast(__m128i) _mm_cmpnle_ss(A, E); 558 int[4] correct1 = [0, 0, 0, 0]; 559 int[4] correct2 = [-1, 0, 0, 0]; 560 int[4] correct3 = [-1, 0, 0, 0]; 561 int[4] correct4 = [0, 0, 0, 0]; 562 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 563 } 564 565 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than. 566 __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe 567 { 568 static if (DMD_with_DSIMD) 569 return cast(__m128) __simd(XMM.CMPPS, a, b, 5); 570 else 571 return cast(__m128) cmpps!(FPComparison.uge)(a, b); 572 } 573 unittest 574 { 575 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 576 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 577 __m128i R = cast(__m128i) _mm_cmpnlt_ps(A, B); 578 int[4] correct = [0, -1, -1, -1]; 579 assert(R.array == correct); 580 } 581 582 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than, 583 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 584 __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe 585 { 586 static if (DMD_with_DSIMD) 587 return cast(__m128) __simd(XMM.CMPSS, a, b, 5); 588 else 589 return cast(__m128) cmpss!(FPComparison.uge)(a, b); 590 } 591 unittest 592 { 593 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 594 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 595 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 596 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 597 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 598 __m128i R1 = cast(__m128i) _mm_cmpnlt_ss(A, B); 599 __m128i R2 = cast(__m128i) _mm_cmpnlt_ss(A, C); 600 __m128i R3 = cast(__m128i) _mm_cmpnlt_ss(A, D); 601 __m128i R4 = cast(__m128i) _mm_cmpnlt_ss(A, E); 602 int[4] correct1 = [-1, 0, 0, 0]; 603 int[4] correct2 = [-1, 0, 0, 0]; 604 int[4] correct3 = [-1, 0, 0, 0]; 605 int[4] correct4 = [0, 0, 0, 0]; 606 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 607 } 608 609 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN. 610 __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe 611 { 612 static if (DMD_with_DSIMD) 613 return cast(__m128) __simd(XMM.CMPPS, a, b, 7); 614 else 615 return cast(__m128) cmpps!(FPComparison.ord)(a, b); 616 } 617 unittest 618 { 619 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 620 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 621 __m128i R = cast(__m128i) _mm_cmpord_ps(A, B); 622 int[4] correct = [-1, -1, -1, 0]; 623 assert(R.array == correct); 624 } 625 626 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN, 627 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 628 __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe 629 { 630 static if (DMD_with_DSIMD) 631 return cast(__m128) __simd(XMM.CMPSS, a, b, 7); 632 else 633 return cast(__m128) cmpss!(FPComparison.ord)(a, b); 634 } 635 unittest 636 { 637 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 638 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 639 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 640 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 641 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 642 __m128i R1 = cast(__m128i) _mm_cmpord_ss(A, B); 643 __m128i R2 = cast(__m128i) _mm_cmpord_ss(A, C); 644 __m128i R3 = cast(__m128i) _mm_cmpord_ss(A, D); 645 __m128i R4 = cast(__m128i) _mm_cmpord_ss(A, E); 646 int[4] correct1 = [-1, 0, 0, 0]; 647 int[4] correct2 = [-1, 0, 0, 0]; 648 int[4] correct3 = [0, 0, 0, 0]; 649 int[4] correct4 = [-1, 0, 0, 0]; 650 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 651 } 652 653 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN. 654 __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe 655 { 656 static if (DMD_with_DSIMD) 657 return cast(__m128) __simd(XMM.CMPPS, a, b, 3); 658 else 659 return cast(__m128) cmpps!(FPComparison.uno)(a, b); 660 } 661 unittest 662 { 663 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 664 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 665 __m128i R = cast(__m128i) _mm_cmpunord_ps(A, B); 666 int[4] correct = [0, 0, 0, -1]; 667 assert(R.array == correct); 668 } 669 670 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN. 671 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 672 __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe 673 { 674 static if (DMD_with_DSIMD) 675 return cast(__m128) __simd(XMM.CMPSS, a, b, 3); 676 else return cast(__m128) cmpss!(FPComparison.uno)(a, b); 677 } 678 unittest 679 { 680 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 681 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 682 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 683 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 684 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 685 __m128i R1 = cast(__m128i) _mm_cmpunord_ss(A, B); 686 __m128i R2 = cast(__m128i) _mm_cmpunord_ss(A, C); 687 __m128i R3 = cast(__m128i) _mm_cmpunord_ss(A, D); 688 __m128i R4 = cast(__m128i) _mm_cmpunord_ss(A, E); 689 int[4] correct1 = [0, 0, 0, 0]; 690 int[4] correct2 = [0, 0, 0, 0]; 691 int[4] correct3 = [-1, 0, 0, 0]; 692 int[4] correct4 = [0, 0, 0, 0]; 693 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 694 } 695 696 697 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for equality, 698 /// and return the boolean result (0 or 1). 699 int _mm_comieq_ss (__m128 a, __m128 b) pure @safe 700 { 701 return a.array[0] == b.array[0]; 702 } 703 unittest 704 { 705 assert(1 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 706 assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 707 assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 708 assert(0 == _mm_comieq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 709 assert(1 == _mm_comieq_ss(_mm_set_ss(0.0), _mm_set_ss(-0.0))); 710 } 711 712 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than-or-equal, 713 /// and return the boolean result (0 or 1). 714 int _mm_comige_ss (__m128 a, __m128 b) pure @safe 715 { 716 return a.array[0] >= b.array[0]; 717 } 718 unittest 719 { 720 assert(1 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 721 assert(1 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 722 assert(0 == _mm_comige_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f))); 723 assert(0 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 724 assert(0 == _mm_comige_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 725 assert(1 == _mm_comige_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f))); 726 } 727 728 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than, 729 /// and return the boolean result (0 or 1). 730 int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta 731 { 732 return a.array[0] > b.array[0]; 733 } 734 unittest 735 { 736 assert(0 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 737 assert(1 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 738 assert(0 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 739 assert(0 == _mm_comigt_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 740 assert(0 == _mm_comigt_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); 741 } 742 743 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than-or-equal, 744 /// and return the boolean result (0 or 1). 745 int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe 746 { 747 return a.array[0] <= b.array[0]; 748 } 749 unittest 750 { 751 assert(1 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 752 assert(0 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 753 assert(1 == _mm_comile_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f))); 754 assert(0 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 755 assert(0 == _mm_comile_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 756 assert(1 == _mm_comile_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); 757 } 758 759 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than, 760 /// and return the boolean result (0 or 1). 761 int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb 762 { 763 return a.array[0] < b.array[0]; 764 } 765 unittest 766 { 767 assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 768 assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 769 assert(1 == _mm_comilt_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f))); 770 assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 771 assert(0 == _mm_comilt_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 772 assert(0 == _mm_comilt_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f))); 773 } 774 775 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for not-equal, 776 /// and return the boolean result (0 or 1). 777 int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne 778 { 779 return a.array[0] != b.array[0]; 780 } 781 unittest 782 { 783 assert(0 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 784 assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 785 assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 786 assert(1 == _mm_comineq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 787 assert(0 == _mm_comineq_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); 788 } 789 790 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 791 /// floating-point elements, store the results in the lower 2 elements, 792 /// and copy the upper 2 packed elements from `a` to the upper elements of result. 793 alias _mm_cvt_pi2ps = _mm_cvtpi32_ps; 794 795 /// Convert 2 lower packed single-precision (32-bit) floating-point elements in `a` 796 /// to packed 32-bit integers. 797 __m64 _mm_cvt_ps2pi (__m128 a) @safe 798 { 799 return to_m64(_mm_cvtps_epi32(a)); 800 } 801 802 /// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element, 803 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 804 /// upper elements of the result. 805 __m128 _mm_cvt_si2ss (__m128 v, int x) pure @trusted 806 { 807 v.ptr[0] = cast(float)x; 808 return v; 809 } 810 unittest 811 { 812 __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42); 813 assert(a.array == [42f, 0, 0, 0]); 814 } 815 816 /// Convert packed 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements. 817 __m128 _mm_cvtpi16_ps (__m64 a) pure @safe 818 { 819 __m128i ma = to_m128i(a); 820 ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit 821 ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit 822 return _mm_cvtepi32_ps(ma); 823 } 824 unittest 825 { 826 __m64 A = _mm_setr_pi16(-1, 2, -3, 4); 827 __m128 R = _mm_cvtpi16_ps(A); 828 float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f]; 829 assert(R.array == correct); 830 } 831 832 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 833 /// floating-point elements, store the results in the lower 2 elements, 834 /// and copy the upper 2 packed elements from `a` to the upper elements of result. 835 __m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted 836 { 837 __m128 fb = _mm_cvtepi32_ps(to_m128i(b)); 838 a.ptr[0] = fb.array[0]; 839 a.ptr[1] = fb.array[1]; 840 return a; 841 } 842 unittest 843 { 844 __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2)); 845 float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f]; 846 assert(R.array == correct); 847 } 848 849 /// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements, 850 /// store the results in the lower 2 elements, then covert the packed signed 32-bit integers in `b` to 851 /// single-precision (32-bit) floating-point element, and store the results in the upper 2 elements. 852 __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted 853 { 854 long2 l; 855 l.ptr[0] = a.array[0]; 856 l.ptr[1] = b.array[0]; 857 return _mm_cvtepi32_ps(cast(__m128i)l); 858 } 859 unittest 860 { 861 __m64 A = _mm_setr_pi32(-45, 128); 862 __m64 B = _mm_setr_pi32(0, 1000); 863 __m128 R = _mm_cvtpi32x2_ps(A, B); 864 float[4] correct = [-45.0f, 128.0f, 0.0f, 1000.0f]; 865 assert(R.array == correct); 866 } 867 868 /// Convert the lower packed 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements. 869 __m128 _mm_cvtpi8_ps (__m64 a) pure @safe 870 { 871 __m128i b = to_m128i(a); 872 873 // Zero extend to 32-bit 874 b = _mm_unpacklo_epi8(b, _mm_setzero_si128()); 875 b = _mm_unpacklo_epi16(b, _mm_setzero_si128()); 876 877 // Replicate sign bit 878 b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24); // Replicate sign bit 879 return _mm_cvtepi32_ps(b); 880 } 881 unittest 882 { 883 __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0); 884 __m128 R = _mm_cvtpi8_ps(A); 885 float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f]; 886 assert(R.array == correct); 887 } 888 889 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 16-bit integers. 890 /// Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF. 891 __m64 _mm_cvtps_pi16 (__m128 a) @safe 892 { 893 // The C++ version of this intrinsic convert to 32-bit float, then use packssdw 894 // Which means the 16-bit integers should be saturated 895 __m128i b = _mm_cvtps_epi32(a); 896 b = _mm_packs_epi32(b, b); 897 return to_m64(b); 898 } 899 unittest 900 { 901 __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f); 902 short4 R = cast(short4) _mm_cvtps_pi16(A); 903 short[4] correct = [-1, 2, -32768, 32767]; 904 assert(R.array == correct); 905 } 906 907 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers. 908 __m64 _mm_cvtps_pi32 (__m128 a) @safe 909 { 910 return to_m64(_mm_cvtps_epi32(a)); 911 } 912 unittest 913 { 914 __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f, ); 915 int2 R = cast(int2) _mm_cvtps_pi32(A); 916 int[2] correct = [-33000, 70000]; 917 assert(R.array == correct); 918 } 919 920 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 8-bit integers, 921 /// and store the results in lower 4 elements. 922 /// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF. 923 __m64 _mm_cvtps_pi8 (__m128 a) @safe 924 { 925 // The C++ version of this intrinsic convert to 32-bit float, then use packssdw + packsswb 926 // Which means the 8-bit integers should be saturated 927 __m128i b = _mm_cvtps_epi32(a); 928 b = _mm_packs_epi32(b, _mm_setzero_si128()); 929 b = _mm_packs_epi16(b, _mm_setzero_si128()); 930 return to_m64(b); 931 } 932 unittest 933 { 934 __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f); 935 byte8 R = cast(byte8) _mm_cvtps_pi8(A); 936 byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0]; 937 assert(R.array == correct); 938 } 939 940 /// Convert packed unsigned 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements. 941 __m128 _mm_cvtpu16_ps (__m64 a) pure @safe 942 { 943 __m128i ma = to_m128i(a); 944 ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit 945 return _mm_cvtepi32_ps(ma); 946 } 947 unittest 948 { 949 __m64 A = _mm_setr_pi16(-1, 2, -3, 4); 950 __m128 R = _mm_cvtpu16_ps(A); 951 float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f]; 952 assert(R.array == correct); 953 } 954 955 /// Convert the lower packed unsigned 8-bit integers in `a` to packed single-precision (32-bit) floating-point element. 956 __m128 _mm_cvtpu8_ps (__m64 a) pure @safe 957 { 958 __m128i b = to_m128i(a); 959 960 // Zero extend to 32-bit 961 b = _mm_unpacklo_epi8(b, _mm_setzero_si128()); 962 b = _mm_unpacklo_epi16(b, _mm_setzero_si128()); 963 return _mm_cvtepi32_ps(b); 964 } 965 unittest 966 { 967 __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0); 968 __m128 R = _mm_cvtpu8_ps(A); 969 float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f]; 970 assert(R.array == correct); 971 } 972 973 /// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element, 974 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 975 /// upper elements of result. 976 __m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted 977 { 978 v.ptr[0] = cast(float)x; 979 return v; 980 } 981 unittest 982 { 983 __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42); 984 assert(a.array == [42.0f, 0, 0, 0]); 985 } 986 987 988 /// Convert the signed 64-bit integer `b` to a single-precision (32-bit) floating-point element, 989 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 990 /// upper elements of result. 991 __m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted 992 { 993 v.ptr[0] = cast(float)x; 994 return v; 995 } 996 unittest 997 { 998 __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42); 999 assert(a.array == [42.0f, 0, 0, 0]); 1000 } 1001 1002 /// Take the lower single-precision (32-bit) floating-point element of `a`. 1003 float _mm_cvtss_f32(__m128 a) pure @safe 1004 { 1005 return a.array[0]; 1006 } 1007 1008 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer. 1009 int _mm_cvtss_si32 (__m128 a) @safe // PERF GDC 1010 { 1011 static if (GDC_with_SSE) 1012 { 1013 return __builtin_ia32_cvtss2si(a); 1014 } 1015 else static if (LDC_with_SSE1) 1016 { 1017 return __builtin_ia32_cvtss2si(a); 1018 } 1019 else static if (DMD_with_DSIMD) 1020 { 1021 __m128 b; 1022 __m128i r = cast(__m128i) __simd(XMM.CVTPS2DQ, a); // Note: converts 4 integers. 1023 return r.array[0]; 1024 } 1025 else 1026 { 1027 return convertFloatToInt32UsingMXCSR(a.array[0]); 1028 } 1029 } 1030 unittest 1031 { 1032 assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f))); 1033 } 1034 1035 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer. 1036 long _mm_cvtss_si64 (__m128 a) @safe 1037 { 1038 version(LDC) 1039 { 1040 version(X86_64) 1041 { 1042 return __builtin_ia32_cvtss2si64(a); 1043 } 1044 else 1045 { 1046 // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer 1047 // using SSE instructions only. So the builtin doesn't exit for this arch. 1048 return convertFloatToInt64UsingMXCSR(a.array[0]); 1049 } 1050 } 1051 else 1052 { 1053 return convertFloatToInt64UsingMXCSR(a.array[0]); 1054 } 1055 } 1056 unittest 1057 { 1058 assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f))); 1059 1060 uint savedRounding = _MM_GET_ROUNDING_MODE(); 1061 1062 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 1063 assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f))); 1064 1065 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 1066 assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f))); 1067 1068 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 1069 assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f))); 1070 1071 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 1072 assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f))); 1073 1074 _MM_SET_ROUNDING_MODE(savedRounding); 1075 } 1076 1077 1078 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit 1079 /// integer with truncation. 1080 int _mm_cvtt_ss2si (__m128 a) pure @safe 1081 { 1082 // x86: cvttss2si always generated, even in -O0 1083 return cast(int)(a.array[0]); 1084 } 1085 alias _mm_cvttss_si32 = _mm_cvtt_ss2si; ///ditto 1086 unittest 1087 { 1088 assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); 1089 } 1090 1091 1092 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit 1093 /// integers with truncation. 1094 __m64 _mm_cvtt_ps2pi (__m128 a) pure @safe 1095 { 1096 return to_m64(_mm_cvttps_epi32(a)); 1097 } 1098 1099 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit 1100 /// integer with truncation. 1101 long _mm_cvttss_si64 (__m128 a) pure @safe 1102 { 1103 return cast(long)(a.array[0]); 1104 } 1105 unittest 1106 { 1107 assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); 1108 } 1109 1110 /// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`. 1111 __m128 _mm_div_ps(__m128 a, __m128 b) pure @safe 1112 { 1113 return a / b; 1114 } 1115 unittest 1116 { 1117 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1118 a = _mm_div_ps(a, a); 1119 float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f]; 1120 assert(a.array == correct); 1121 } 1122 1123 /// Divide the lower single-precision (32-bit) floating-point element in `a` by the lower 1124 /// single-precision (32-bit) floating-point element in `b`, store the result in the lower 1125 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result. 1126 __m128 _mm_div_ss(__m128 a, __m128 b) pure @safe 1127 { 1128 static if (DMD_with_DSIMD) 1129 return cast(__m128) __simd(XMM.DIVSS, a, b); 1130 else static if (GDC_with_SSE) 1131 return __builtin_ia32_divss(a, b); 1132 else 1133 { 1134 a[0] /= b[0]; 1135 return a; 1136 } 1137 } 1138 unittest 1139 { 1140 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1141 a = _mm_div_ss(a, a); 1142 float[4] correct = [1.0f, -2.0, 3.0f, 1.0f]; 1143 assert(a.array == correct); 1144 } 1145 1146 /// Extract a 16-bit unsigned integer from `a`, selected with `imm8`. Zero-extended. 1147 int _mm_extract_pi16 (__m64 a, int imm8) 1148 { 1149 short4 sa = cast(short4)a; 1150 return cast(ushort)(sa.array[imm8]); 1151 } 1152 unittest 1153 { 1154 __m64 A = _mm_setr_pi16(-1, 6, 0, 4); 1155 assert(_mm_extract_pi16(A, 0) == 65535); 1156 assert(_mm_extract_pi16(A, 1) == 6); 1157 assert(_mm_extract_pi16(A, 2) == 0); 1158 assert(_mm_extract_pi16(A, 3) == 4); 1159 } 1160 1161 /// Free aligned memory that was allocated with `_mm_malloc`. 1162 void _mm_free(void * mem_addr) @trusted 1163 { 1164 // support for free(NULL) 1165 if (mem_addr is null) 1166 return; 1167 1168 // Technically we don't need to store size and alignement in the chunk, but we do in case we 1169 // have to implement _mm_realloc 1170 1171 size_t pointerSize = (void*).sizeof; 1172 void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof); 1173 size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize); 1174 size_t alignment = *alignmentLocation; 1175 assert(alignment != 0); 1176 assert(isPointerAligned(mem_addr, alignment)); 1177 free(*rawLocation); 1178 } 1179 1180 /// Get the exception mask bits from the MXCSR control and status register. 1181 /// The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, 1182 /// `_MM_MASK_DIV_ZERO`, `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`. 1183 /// Note: won't correspond to reality on non-x86, where MXCSR this is emulated. 1184 uint _MM_GET_EXCEPTION_MASK() @safe 1185 { 1186 return _mm_getcsr() & _MM_MASK_MASK; 1187 } 1188 1189 /// Get the exception state bits from the MXCSR control and status register. 1190 /// The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 1191 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`. 1192 /// Note: won't correspond to reality on non-x86, where MXCSR this is emulated. No exception reported. 1193 uint _MM_GET_EXCEPTION_STATE() @safe 1194 { 1195 return _mm_getcsr() & _MM_EXCEPT_MASK; 1196 } 1197 1198 /// Get the flush zero bits from the MXCSR control and status register. 1199 /// The flush zero may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF` 1200 uint _MM_GET_FLUSH_ZERO_MODE() @safe 1201 { 1202 return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; 1203 } 1204 1205 /// Get the rounding mode bits from the MXCSR control and status register. The rounding mode may 1206 /// contain any of the following flags: `_MM_ROUND_NEAREST, `_MM_ROUND_DOWN`, `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`. 1207 uint _MM_GET_ROUNDING_MODE() @safe 1208 { 1209 return _mm_getcsr() & _MM_ROUND_MASK; 1210 } 1211 1212 /// Get the unsigned 32-bit value of the MXCSR control and status register. 1213 /// Note: this is emulated on ARM, because there is no MXCSR register then. 1214 uint _mm_getcsr() @trusted 1215 { 1216 static if (LDC_with_ARM) 1217 { 1218 // Note: we convert the ARM FPSCR into a x86 SSE control word. 1219 // However, only rounding mode and flush to zero are actually set. 1220 // The returned control word will have all exceptions masked, and no exception detected. 1221 1222 uint fpscr = arm_get_fpcr(); 1223 1224 uint cw = 0; // No exception detected 1225 if (fpscr & _MM_FLUSH_ZERO_MASK_ARM) 1226 { 1227 // ARM has one single flag for ARM. 1228 // It does both x86 bits. 1229 // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode 1230 cw |= _MM_FLUSH_ZERO_ON; 1231 cw |= 0x40; // set "denormals are zeros" 1232 } 1233 cw |= _MM_MASK_MASK; // All exception maske 1234 1235 // Rounding mode 1236 switch(fpscr & _MM_ROUND_MASK_ARM) 1237 { 1238 default: 1239 case _MM_ROUND_NEAREST_ARM: cw |= _MM_ROUND_NEAREST; break; 1240 case _MM_ROUND_DOWN_ARM: cw |= _MM_ROUND_DOWN; break; 1241 case _MM_ROUND_UP_ARM: cw |= _MM_ROUND_UP; break; 1242 case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO; break; 1243 } 1244 return cw; 1245 } 1246 else version(GNU) 1247 { 1248 static if (GDC_with_SSE) 1249 { 1250 return __builtin_ia32_stmxcsr(); 1251 } 1252 else version(X86) 1253 { 1254 uint sseRounding = 0; 1255 asm pure nothrow @nogc @trusted 1256 { 1257 "stmxcsr %0;\n" 1258 : "=m" (sseRounding) 1259 : 1260 : ; 1261 } 1262 return sseRounding; 1263 } 1264 else 1265 static assert(false); 1266 } 1267 else version (InlineX86Asm) 1268 { 1269 uint controlWord; 1270 asm nothrow @nogc pure @safe 1271 { 1272 stmxcsr controlWord; 1273 } 1274 return controlWord; 1275 } 1276 else 1277 static assert(0, "Not yet supported"); 1278 } 1279 unittest 1280 { 1281 uint csr = _mm_getcsr(); 1282 } 1283 1284 /// Insert a 16-bit integer `i` inside `a` at the location specified by `imm8`. 1285 __m64 _mm_insert_pi16 (__m64 v, int i, int imm8) pure @trusted 1286 { 1287 short4 r = cast(short4)v; 1288 r.ptr[imm8 & 3] = cast(short)i; 1289 return cast(__m64)r; 1290 } 1291 unittest 1292 { 1293 __m64 A = _mm_set_pi16(3, 2, 1, 0); 1294 short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4); 1295 short[4] correct = [0, 42, 2, 3]; 1296 assert(R.array == correct); 1297 } 1298 1299 /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory. 1300 // `p` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1301 __m128 _mm_load_ps(const(float)*p) pure @trusted // TODO shouldn't be trusted 1302 { 1303 return *cast(__m128*)p; 1304 } 1305 unittest 1306 { 1307 static immutable align(16) float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f]; 1308 __m128 A = _mm_load_ps(correct.ptr); 1309 assert(A.array == correct); 1310 } 1311 1312 /// Load a single-precision (32-bit) floating-point element from memory into all elements. 1313 __m128 _mm_load_ps1(const(float)*p) pure @trusted 1314 { 1315 return __m128(*p); 1316 } 1317 unittest 1318 { 1319 float n = 2.5f; 1320 float[4] correct = [2.5f, 2.5f, 2.5f, 2.5f]; 1321 __m128 A = _mm_load_ps1(&n); 1322 assert(A.array == correct); 1323 } 1324 1325 /// Load a single-precision (32-bit) floating-point element from memory into the lower of dst, and zero the upper 3 1326 /// elements. `mem_addr` does not need to be aligned on any particular boundary. 1327 __m128 _mm_load_ss (const(float)* mem_addr) pure @trusted 1328 { 1329 static if (DMD_with_DSIMD) 1330 { 1331 return cast(__m128)__simd(XMM.LODSS, *cast(__m128*)mem_addr); 1332 } 1333 else 1334 { 1335 __m128 r; 1336 r.ptr[0] = *mem_addr; 1337 r.ptr[1] = 0; 1338 r.ptr[2] = 0; 1339 r.ptr[3] = 0; 1340 return r; 1341 } 1342 } 1343 unittest 1344 { 1345 float n = 2.5f; 1346 float[4] correct = [2.5f, 0.0f, 0.0f, 0.0f]; 1347 __m128 A = _mm_load_ss(&n); 1348 assert(A.array == correct); 1349 } 1350 1351 /// Load a single-precision (32-bit) floating-point element from memory into all elements. 1352 alias _mm_load1_ps = _mm_load_ps1; 1353 1354 /// Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of result, 1355 /// and copy the lower 2 elements from `a` to result. `mem_addr does` not need to be aligned on any particular boundary. 1356 __m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted 1357 { 1358 static if (DMD_with_DSIMD) 1359 { 1360 return cast(__m128) __simd(XMM.LODHPS, a, *cast(const(__m128)*)mem_addr); 1361 } 1362 else 1363 { 1364 // x86: movlhps generated since LDC 1.9.0 -O1 1365 long2 la = cast(long2)a; 1366 la.ptr[1] = (*mem_addr).array[0]; 1367 return cast(__m128)la; 1368 } 1369 } 1370 unittest 1371 { 1372 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1373 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1374 __m64 M = to_m64(cast(__m128i)B); 1375 __m128 R = _mm_loadh_pi(A, &M); 1376 float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f]; 1377 assert(R.array == correct); 1378 } 1379 1380 /// Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of result, 1381 /// and copy the upper 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary. 1382 __m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted 1383 { 1384 static if (DMD_with_DSIMD) 1385 { 1386 return cast(__m128) __simd(XMM.LODLPS, a, *cast(const(__m128)*)mem_addr); 1387 } 1388 else 1389 { 1390 // x86: movlpd/movlps generated with all LDC -01 1391 long2 la = cast(long2)a; 1392 la.ptr[0] = (*mem_addr).array[0]; 1393 return cast(__m128)la; 1394 } 1395 } 1396 unittest 1397 { 1398 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1399 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1400 __m64 M = to_m64(cast(__m128i)B); 1401 __m128 R = _mm_loadl_pi(A, &M); 1402 float[4] correct = [5.0f, 6.0f, 3.0f, 4.0f]; 1403 assert(R.array == correct); 1404 } 1405 1406 /// Load 4 single-precision (32-bit) floating-point elements from memory in reverse order. 1407 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1408 __m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted // TODO shouldn't be trusted 1409 { 1410 __m128* aligned = cast(__m128*)mem_addr; // x86: movaps + shups since LDC 1.0.0 -O1 1411 __m128 a = *aligned; 1412 static if (DMD_with_DSIMD) 1413 { 1414 return cast(__m128) __simd(XMM.SHUFPS, a, a, 27); 1415 } 1416 else 1417 { 1418 __m128 r; 1419 r.ptr[0] = a.array[3]; 1420 r.ptr[1] = a.array[2]; 1421 r.ptr[2] = a.array[1]; 1422 r.ptr[3] = a.array[0]; 1423 return r; 1424 } 1425 } 1426 unittest 1427 { 1428 align(16) static immutable float[4] arr = [ 1.0f, 2.0f, 3.0f, 8.0f ]; 1429 __m128 A = _mm_loadr_ps(arr.ptr); 1430 float[4] correct = [ 8.0f, 3.0f, 2.0f, 1.0f ]; 1431 assert(A.array == correct); 1432 } 1433 1434 /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory. 1435 /// `mem_addr` does not need to be aligned on any particular boundary. 1436 __m128 _mm_loadu_ps(const(float)* mem_addr) pure @trusted 1437 { 1438 static if (GDC_with_SSE2) 1439 { 1440 return __builtin_ia32_loadups(mem_addr); 1441 } 1442 else version(LDC) 1443 { 1444 return loadUnaligned!(__m128)(mem_addr); 1445 } 1446 else version(DigitalMars) 1447 { 1448 static if (DMD_with_DSIMD) 1449 { 1450 return cast(__m128)__simd(XMM.LODUPS, *mem_addr); 1451 } 1452 else static if (SSESizedVectorsAreEmulated) 1453 { 1454 // Since this vector is emulated, it doesn't have alignement constraints 1455 // and as such we can just cast it. 1456 return *cast(__m128*)(mem_addr); 1457 } 1458 else 1459 { 1460 __m128 result; 1461 result.ptr[0] = mem_addr[0]; 1462 result.ptr[1] = mem_addr[1]; 1463 result.ptr[2] = mem_addr[2]; 1464 result.ptr[3] = mem_addr[3]; 1465 return result; 1466 } 1467 } 1468 else 1469 { 1470 __m128 result; 1471 result.ptr[0] = mem_addr[0]; 1472 result.ptr[1] = mem_addr[1]; 1473 result.ptr[2] = mem_addr[2]; 1474 result.ptr[3] = mem_addr[3]; 1475 return result; 1476 } 1477 } 1478 unittest 1479 { 1480 align(16) static immutable float[5] arr = [ 1.0f, 2.0f, 3.0f, 8.0f, 9.0f ]; // force unaligned load 1481 __m128 A = _mm_loadu_ps(&arr[1]); 1482 float[4] correct = [ 2.0f, 3.0f, 8.0f, 9.0f ]; 1483 assert(A.array == correct); 1484 } 1485 1486 /// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise. 1487 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted 1488 { 1489 static if (DMD_with_DSIMD) 1490 { 1491 int r = *cast(short*)(mem_addr); 1492 return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r); 1493 } 1494 else version(DigitalMars) 1495 { 1496 // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672 1497 // DMD cannot handle the below code... 1498 align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0]; 1499 r[0] = *cast(short*)(mem_addr); 1500 return *cast(int4*)(r.ptr); 1501 } 1502 else 1503 { 1504 short r = *cast(short*)(mem_addr); 1505 short8 result = [0, 0, 0, 0, 0, 0, 0, 0]; 1506 result.ptr[0] = r; 1507 return cast(__m128i)result; 1508 } 1509 } 1510 unittest 1511 { 1512 short r = 13; 1513 short8 A = cast(short8) _mm_loadu_si16(&r); 1514 short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0]; 1515 assert(A.array == correct); 1516 } 1517 1518 /// Load unaligned 64-bit integer from memory into the first element of result. 1519 /// Upper 64-bit is zeroed. 1520 __m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted 1521 { 1522 static if (DMD_with_DSIMD) 1523 { 1524 return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr); 1525 } 1526 else 1527 { 1528 long r = *cast(long*)(mem_addr); 1529 long2 result = [0, 0]; 1530 result.ptr[0] = r; 1531 return cast(__m128i)result; 1532 } 1533 } 1534 unittest 1535 { 1536 long r = 446446446446; 1537 long2 A = cast(long2) _mm_loadu_si64(&r); 1538 long[2] correct = [446446446446, 0]; 1539 assert(A.array == correct); 1540 } 1541 1542 /// Allocate size bytes of memory, aligned to the alignment specified in align, 1543 /// and return a pointer to the allocated memory. `_mm_free` should be used to free 1544 /// memory that is allocated with `_mm_malloc`. 1545 void* _mm_malloc(size_t size, size_t alignment) @trusted 1546 { 1547 assert(alignment != 0); 1548 size_t request = requestedSize(size, alignment); 1549 void* raw = malloc(request); 1550 if (request > 0 && raw == null) // malloc(0) can validly return anything 1551 onOutOfMemoryError(); 1552 return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size 1553 } 1554 1555 /// Conditionally store 8-bit integer elements from a into memory using mask (elements are not stored when the highest 1556 /// bit is not set in the corresponding element) and a non-temporal memory hint. 1557 void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted 1558 { 1559 // this works since mask is zero-extended 1560 return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr); 1561 } 1562 1563 deprecated("Use _mm_maskmove_si64 instead") alias _m_maskmovq = _mm_maskmove_si64;/// 1564 1565 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum value. 1566 __m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe 1567 { 1568 return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b))); 1569 } 1570 1571 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed maximum values. 1572 __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe 1573 { 1574 static if (DMD_with_DSIMD) 1575 { 1576 return cast(__m128) __simd(XMM.MAXPS, a, b); 1577 } 1578 else static if (GDC_with_SSE) 1579 { 1580 return __builtin_ia32_maxps(a, b); 1581 } 1582 else static if (LDC_with_SSE1) 1583 { 1584 return __builtin_ia32_maxps(a, b); 1585 } 1586 else 1587 { 1588 // ARM: Optimized into fcmgt + bsl since LDC 1.8 -02 1589 __m128 r; 1590 r[0] = (a[0] > b[0]) ? a[0] : b[0]; 1591 r[1] = (a[1] > b[1]) ? a[1] : b[1]; 1592 r[2] = (a[2] > b[2]) ? a[2] : b[2]; 1593 r[3] = (a[3] > b[3]) ? a[3] : b[3]; 1594 return r; 1595 } 1596 } 1597 unittest 1598 { 1599 __m128 A = _mm_setr_ps(1, 2, float.nan, 4); 1600 __m128 B = _mm_setr_ps(4, 1, 4, float.nan); 1601 __m128 M = _mm_max_ps(A, B); 1602 assert(M.array[0] == 4); 1603 assert(M.array[1] == 2); 1604 assert(M.array[2] == 4); // in case of NaN, second operand prevails (as it seems) 1605 assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems) 1606 } 1607 1608 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values. 1609 __m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe 1610 { 1611 return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b))); 1612 } 1613 1614 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the maximum value in the 1615 /// lower element of result, and copy the upper 3 packed elements from `a` to the upper element of result. 1616 __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe 1617 { 1618 static if (DMD_with_DSIMD) 1619 { 1620 return cast(__m128) __simd(XMM.MAXSS, a, b); 1621 } 1622 else static if (GDC_with_SSE) 1623 { 1624 return __builtin_ia32_maxss(a, b); 1625 } 1626 else static if (LDC_with_SSE1) 1627 { 1628 return __builtin_ia32_maxss(a, b); 1629 } 1630 else 1631 { 1632 __m128 r = a; 1633 r[0] = (a[0] > b[0]) ? a[0] : b[0]; 1634 return r; 1635 } 1636 } 1637 unittest 1638 { 1639 __m128 A = _mm_setr_ps(1, 2, 3, 4); 1640 __m128 B = _mm_setr_ps(4, 1, 4, 1); 1641 __m128 C = _mm_setr_ps(float.nan, 1, 4, 1); 1642 __m128 M = _mm_max_ss(A, B); 1643 assert(M.array[0] == 4); 1644 assert(M.array[1] == 2); 1645 assert(M.array[2] == 3); 1646 assert(M.array[3] == 4); 1647 M = _mm_max_ps(A, C); // in case of NaN, second operand prevails 1648 assert(M.array[0] != M.array[0]); 1649 M = _mm_max_ps(C, A); // in case of NaN, second operand prevails 1650 assert(M.array[0] == 1); 1651 } 1652 1653 /// Compare packed signed 16-bit integers in a and b, and return packed minimum values. 1654 __m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe 1655 { 1656 return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b))); 1657 } 1658 1659 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed maximum values. 1660 __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe 1661 { 1662 static if (DMD_with_DSIMD) 1663 { 1664 return cast(__m128) __simd(XMM.MINPS, a, b); 1665 } 1666 else static if (GDC_with_SSE) 1667 { 1668 return __builtin_ia32_minps(a, b); 1669 } 1670 else static if (LDC_with_SSE1) 1671 { 1672 // not technically needed, but better perf in debug mode 1673 return __builtin_ia32_minps(a, b); 1674 } 1675 else 1676 { 1677 // ARM: Optimized into fcmgt + bsl since LDC 1.8 -02 1678 __m128 r; 1679 r[0] = (a[0] < b[0]) ? a[0] : b[0]; 1680 r[1] = (a[1] < b[1]) ? a[1] : b[1]; 1681 r[2] = (a[2] < b[2]) ? a[2] : b[2]; 1682 r[3] = (a[3] < b[3]) ? a[3] : b[3]; 1683 return r; 1684 } 1685 } 1686 unittest 1687 { 1688 __m128 A = _mm_setr_ps(1, 2, float.nan, 4); 1689 __m128 B = _mm_setr_ps(4, 1, 4, float.nan); 1690 __m128 M = _mm_min_ps(A, B); 1691 assert(M.array[0] == 1); 1692 assert(M.array[1] == 1); 1693 assert(M.array[2] == 4); // in case of NaN, second operand prevails (as it seems) 1694 assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems) 1695 } 1696 1697 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values. 1698 __m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe 1699 { 1700 return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b))); 1701 } 1702 1703 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the minimum value in the 1704 /// lower element of result, and copy the upper 3 packed elements from `a` to the upper element of result. 1705 __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe 1706 { 1707 static if (DMD_with_DSIMD) 1708 { 1709 return cast(__m128) __simd(XMM.MINSS, a, b); 1710 } 1711 else static if (GDC_with_SSE) 1712 { 1713 return __builtin_ia32_minss(a, b); 1714 } 1715 else static if (LDC_with_SSE1) 1716 { 1717 return __builtin_ia32_minss(a, b); 1718 } 1719 else 1720 { 1721 // Generates minss since LDC 1.3 -O1 1722 __m128 r = a; 1723 r[0] = (a[0] < b[0]) ? a[0] : b[0]; 1724 return r; 1725 } 1726 } 1727 unittest 1728 { 1729 __m128 A = _mm_setr_ps(1, 2, 3, 4); 1730 __m128 B = _mm_setr_ps(4, 1, 4, 1); 1731 __m128 C = _mm_setr_ps(float.nan, 1, 4, 1); 1732 __m128 M = _mm_min_ss(A, B); 1733 assert(M.array[0] == 1); 1734 assert(M.array[1] == 2); 1735 assert(M.array[2] == 3); 1736 assert(M.array[3] == 4); 1737 M = _mm_min_ps(A, C); // in case of NaN, second operand prevails 1738 assert(M.array[0] != M.array[0]); 1739 M = _mm_min_ps(C, A); // in case of NaN, second operand prevails 1740 assert(M.array[0] == 1); 1741 } 1742 1743 /// Move the lower single-precision (32-bit) floating-point element from `b` to the lower element of result, and copy 1744 /// the upper 3 packed elements from `a` to the upper elements of result. 1745 __m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted 1746 { 1747 // Workaround https://issues.dlang.org/show_bug.cgi?id=21673 1748 // inlining of this function fails. 1749 version(DigitalMars) asm nothrow @nogc pure { nop; } 1750 1751 a.ptr[0] = b.array[0]; 1752 return a; 1753 } 1754 unittest 1755 { 1756 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1757 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1758 __m128 R = _mm_move_ss(A, B); 1759 float[4] correct = [5.0f, 2.0f, 3.0f, 4.0f]; 1760 assert(R.array == correct); 1761 } 1762 1763 /// Move the upper 2 single-precision (32-bit) floating-point elements from `b` to the lower 2 elements of result, and 1764 /// copy the upper 2 elements from `a` to the upper 2 elements of dst. 1765 __m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted 1766 { 1767 // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443 1768 /* 1769 static if (DMD_with_DSIMD) 1770 { 1771 1772 return cast(__m128) __simd(XMM.MOVHLPS, a, b); 1773 } 1774 else */ 1775 { 1776 a.ptr[0] = b.array[2]; 1777 a.ptr[1] = b.array[3]; 1778 return a; 1779 } 1780 } 1781 unittest 1782 { 1783 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1784 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1785 __m128 R = _mm_movehl_ps(A, B); 1786 float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f]; 1787 assert(R.array == correct); 1788 } 1789 1790 /// Move the lower 2 single-precision (32-bit) floating-point elements from `b` to the upper 2 elements of result, and 1791 /// copy the lower 2 elements from `a` to the lower 2 elements of result 1792 __m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted 1793 { 1794 // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443 1795 /* 1796 static if (DMD_with_DSIMD) 1797 { 1798 return cast(__m128) __simd(XMM.MOVLHPS, a, b); 1799 } 1800 else 1801 */ 1802 { 1803 a.ptr[2] = b.array[0]; 1804 a.ptr[3] = b.array[1]; 1805 return a; 1806 } 1807 } 1808 unittest 1809 { 1810 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1811 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1812 __m128 R = _mm_movelh_ps(A, B); 1813 float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f]; 1814 assert(R.array == correct); 1815 } 1816 1817 /// Create mask from the most significant bit of each 8-bit element in `a`. 1818 int _mm_movemask_pi8 (__m64 a) pure @safe 1819 { 1820 return _mm_movemask_epi8(to_m128i(a)); 1821 } 1822 unittest 1823 { 1824 assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0))); 1825 } 1826 1827 /// Set each bit of result based on the most significant bit of the corresponding packed single-precision (32-bit) 1828 /// floating-point element in `a`. 1829 int _mm_movemask_ps (__m128 a) pure @trusted 1830 { 1831 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 1832 static if (GDC_with_SSE) 1833 { 1834 return __builtin_ia32_movmskps(a); 1835 } 1836 else static if (LDC_with_SSE1) 1837 { 1838 return __builtin_ia32_movmskps(a); 1839 } 1840 else static if (LDC_with_ARM) 1841 { 1842 int4 ai = cast(int4)a; 1843 int4 shift31 = [31, 31, 31, 31]; 1844 ai = ai >>> shift31; 1845 int4 shift = [0, 1, 2, 3]; 1846 ai = ai << shift; // 4-way shift, only efficient on ARM. 1847 int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]); 1848 return r; 1849 } 1850 else 1851 { 1852 int4 ai = cast(int4)a; 1853 int r = 0; 1854 if (ai.array[0] < 0) r += 1; 1855 if (ai.array[1] < 0) r += 2; 1856 if (ai.array[2] < 0) r += 4; 1857 if (ai.array[3] < 0) r += 8; 1858 return r; 1859 } 1860 } 1861 unittest 1862 { 1863 int4 A = [-1, 0, -43, 0]; 1864 assert(5 == _mm_movemask_ps(cast(float4)A)); 1865 } 1866 1867 /// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`. 1868 __m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe 1869 { 1870 return a * b; 1871 } 1872 unittest 1873 { 1874 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1875 a = _mm_mul_ps(a, a); 1876 float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f]; 1877 assert(a.array == correct); 1878 } 1879 1880 /// Multiply the lower single-precision (32-bit) floating-point element in `a` and `b`, store the result in the lower 1881 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result. 1882 __m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe 1883 { 1884 static if (DMD_with_DSIMD) 1885 return cast(__m128) __simd(XMM.MULSS, a, b); 1886 else static if (GDC_with_SSE) 1887 return __builtin_ia32_mulss(a, b); 1888 else 1889 { 1890 a[0] *= b[0]; 1891 return a; 1892 } 1893 } 1894 unittest 1895 { 1896 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1897 a = _mm_mul_ss(a, a); 1898 float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f]; 1899 assert(a.array == correct); 1900 } 1901 1902 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, 1903 /// and return the high 16 bits of the intermediate integers. 1904 __m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe 1905 { 1906 return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b))); 1907 } 1908 unittest 1909 { 1910 __m64 A = _mm_setr_pi16(0, -16, 2, 3); 1911 __m64 B = _mm_set1_pi16(16384); 1912 short4 R = cast(short4)_mm_mulhi_pu16(A, B); 1913 short[4] correct = [0, 0x3FFC, 0, 0]; 1914 assert(R.array == correct); 1915 } 1916 1917 /// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`, and 1918 /// return the result. 1919 __m128 _mm_or_ps (__m128 a, __m128 b) pure @safe 1920 { 1921 static if (DMD_with_DSIMD) 1922 return cast(__m128)__simd(XMM.ORPS, a, b); 1923 else 1924 return cast(__m128)(cast(__m128i)a | cast(__m128i)b); 1925 } 1926 1927 deprecated("Use _mm_avg_pu8 instead") alias _m_pavgb = _mm_avg_pu8;/// 1928 deprecated("Use _mm_avg_pu16 instead") alias _m_pavgw = _mm_avg_pu16;/// 1929 deprecated("Use _mm_extract_pi16 instead") alias _m_pextrw = _mm_extract_pi16;/// 1930 deprecated("Use _mm_insert_pi16 instead") alias _m_pinsrw = _mm_insert_pi16;/// 1931 deprecated("Use _mm_max_pi16 instead") alias _m_pmaxsw = _mm_max_pi16;/// 1932 deprecated("Use _mm_max_pu8 instead") alias _m_pmaxub = _mm_max_pu8;/// 1933 deprecated("Use _mm_min_pi16 instead") alias _m_pminsw = _mm_min_pi16;/// 1934 deprecated("Use _mm_min_pu8 instead") alias _m_pminub = _mm_min_pu8;/// 1935 deprecated("Use _mm_movemask_pi8 instead") alias _m_pmovmskb = _mm_movemask_pi8;/// 1936 deprecated("Use _mm_mulhi_pu16 instead") alias _m_pmulhuw = _mm_mulhi_pu16;/// 1937 1938 enum _MM_HINT_T0 = 3; /// 1939 enum _MM_HINT_T1 = 2; /// 1940 enum _MM_HINT_T2 = 1; /// 1941 enum _MM_HINT_NTA = 0; /// 1942 1943 1944 version(LDC) 1945 { 1946 // Starting with LLVM 10, it seems llvm.prefetch has changed its name. 1947 // Was reported at: https://github.com/ldc-developers/ldc/issues/3397 1948 static if (__VERSION__ >= 2091) 1949 { 1950 pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch" 1951 void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe; 1952 } 1953 } 1954 1955 /// Fetch the line of data from memory that contains address `p` to a location in the 1956 /// cache hierarchy specified by the locality hint i. 1957 /// 1958 /// Warning: `locality` is a compile-time parameter, unlike in Intel Intrinsics API. 1959 void _mm_prefetch(int locality)(const(void)* p) pure @trusted 1960 { 1961 static if (GDC_with_SSE) 1962 { 1963 return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3); 1964 } 1965 else static if (DMD_with_DSIMD) 1966 { 1967 enum bool isWrite = (locality & 0x4) != 0; 1968 enum level = locality & 3; 1969 return prefetch!(isWrite, level)(p); 1970 } 1971 else version(LDC) 1972 { 1973 static if (__VERSION__ >= 2091) 1974 { 1975 // const_cast here. `llvm_prefetch` wants a mutable pointer 1976 llvm_prefetch_fixed( cast(void*)p, 0, locality, 1); 1977 } 1978 else 1979 { 1980 // const_cast here. `llvm_prefetch` wants a mutable pointer 1981 llvm_prefetch( cast(void*)p, 0, locality, 1); 1982 } 1983 } 1984 else version(D_InlineAsm_X86_64) 1985 { 1986 static if (locality == _MM_HINT_NTA) 1987 { 1988 asm pure nothrow @nogc @trusted 1989 { 1990 mov RAX, p; 1991 prefetchnta [RAX]; 1992 } 1993 } 1994 else static if (locality == _MM_HINT_T0) 1995 { 1996 asm pure nothrow @nogc @trusted 1997 { 1998 mov RAX, p; 1999 prefetcht0 [RAX]; 2000 } 2001 } 2002 else static if (locality == _MM_HINT_T1) 2003 { 2004 asm pure nothrow @nogc @trusted 2005 { 2006 mov RAX, p; 2007 prefetcht1 [RAX]; 2008 } 2009 } 2010 else static if (locality == _MM_HINT_T2) 2011 { 2012 asm pure nothrow @nogc @trusted 2013 { 2014 mov RAX, p; 2015 prefetcht2 [RAX]; 2016 } 2017 } 2018 else 2019 assert(false); // invalid locality hint 2020 } 2021 else version(D_InlineAsm_X86) 2022 { 2023 static if (locality == _MM_HINT_NTA) 2024 { 2025 asm pure nothrow @nogc @trusted 2026 { 2027 mov EAX, p; 2028 prefetchnta [EAX]; 2029 } 2030 } 2031 else static if (locality == _MM_HINT_T0) 2032 { 2033 asm pure nothrow @nogc @trusted 2034 { 2035 mov EAX, p; 2036 prefetcht0 [EAX]; 2037 } 2038 } 2039 else static if (locality == _MM_HINT_T1) 2040 { 2041 asm pure nothrow @nogc @trusted 2042 { 2043 mov EAX, p; 2044 prefetcht1 [EAX]; 2045 } 2046 } 2047 else static if (locality == _MM_HINT_T2) 2048 { 2049 asm pure nothrow @nogc @trusted 2050 { 2051 mov EAX, p; 2052 prefetcht2 [EAX]; 2053 } 2054 } 2055 else 2056 assert(false); // invalid locality hint 2057 } 2058 else 2059 { 2060 // Generic version: do nothing. From bitter experience, 2061 // it's unlikely you get ANY speed-up with manual prefetching. 2062 // Prefetching or not doesn't change program behaviour. 2063 } 2064 } 2065 unittest 2066 { 2067 // From Intel documentation: 2068 // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of 2069 // 32 bytes." 2070 ubyte[256] cacheline; // though it seems it cannot generate GP fault 2071 _mm_prefetch!_MM_HINT_T0(cacheline.ptr); 2072 _mm_prefetch!_MM_HINT_T1(cacheline.ptr); 2073 _mm_prefetch!_MM_HINT_T2(cacheline.ptr); 2074 _mm_prefetch!_MM_HINT_NTA(cacheline.ptr); 2075 } 2076 2077 deprecated("Use _mm_sad_pu8 instead") alias _m_psadbw = _mm_sad_pu8;/// 2078 deprecated("Use _mm_shuffle_pi16 instead") alias _m_pshufw = _mm_shuffle_pi16;/// 2079 2080 2081 /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a`` , 2082 /// and return the results. The maximum relative error for this approximation is less than 1.5*2^-12. 2083 __m128 _mm_rcp_ps (__m128 a) pure @trusted 2084 { 2085 static if (DMD_with_DSIMD) 2086 { 2087 return cast(__m128) __simd(XMM.RCPPS, a); 2088 } 2089 else static if (GDC_with_SSE) 2090 { 2091 return __builtin_ia32_rcpps(a); 2092 } 2093 else static if (LDC_with_SSE1) 2094 { 2095 return __builtin_ia32_rcpps(a); 2096 } 2097 else 2098 { 2099 a.ptr[0] = 1.0f / a.array[0]; 2100 a.ptr[1] = 1.0f / a.array[1]; 2101 a.ptr[2] = 1.0f / a.array[2]; 2102 a.ptr[3] = 1.0f / a.array[3]; 2103 return a; 2104 } 2105 } 2106 unittest 2107 { 2108 __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f); 2109 __m128 groundTruth = _mm_set1_ps(1.0f) / A; 2110 __m128 result = _mm_rcp_ps(A); 2111 foreach(i; 0..4) 2112 { 2113 double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; 2114 assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 2115 } 2116 } 2117 2118 /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in `a`, store it 2119 /// in the lower element of the result, and copy the upper 3 packed elements from `a` to the upper elements of result. 2120 /// The maximum relative error for this approximation is less than 1.5*2^-12. 2121 __m128 _mm_rcp_ss (__m128 a) pure @trusted 2122 { 2123 static if (DMD_with_DSIMD) 2124 { 2125 return cast(__m128) __simd(XMM.RCPSS, a); 2126 } 2127 else static if (GDC_with_SSE) 2128 { 2129 return __builtin_ia32_rcpss(a); 2130 } 2131 else static if (LDC_with_SSE1) 2132 { 2133 return __builtin_ia32_rcpss(a); 2134 } 2135 else 2136 { 2137 a.ptr[0] = 1.0f / a.array[0]; 2138 return a; 2139 } 2140 } 2141 unittest 2142 { 2143 __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f); 2144 __m128 correct = _mm_setr_ps(1 / 2.34f, -70000.0f, 0.00001f, 345.5f); 2145 __m128 R = _mm_rcp_ss(A); 2146 double relError = (cast(double)(correct.array[0]) / R.array[0]) - 1; 2147 assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 2148 assert(R.array[1] == correct.array[1]); 2149 assert(R.array[2] == correct.array[2]); 2150 assert(R.array[3] == correct.array[3]); 2151 } 2152 2153 /// Reallocate `size` bytes of memory, aligned to the alignment specified in `alignment`, 2154 /// and return a pointer to the newly allocated memory. 2155 /// `_mm_free` or `alignedRealloc` with size 0 should be used to free memory that is 2156 /// allocated with `_mm_malloc` or `_mm_realloc`. 2157 /// Previous data is preserved. 2158 void* _mm_realloc(void* aligned, size_t size, size_t alignment) nothrow @nogc // #BONUS 2159 { 2160 return alignedReallocImpl!true(aligned, size, alignment); 2161 } 2162 unittest 2163 { 2164 enum NALLOC = 8; 2165 enum size_t[8] ALIGNMENTS = [1, 2, 4, 8, 16, 32, 64, 128]; 2166 2167 void*[NALLOC] alloc; 2168 2169 foreach(t; 0..100) 2170 { 2171 foreach(n; 0..NALLOC) 2172 { 2173 size_t alignment = ALIGNMENTS[n]; 2174 size_t s = ( (n + t * 69096) & 0xffff ); 2175 alloc[n] = _mm_realloc(alloc[n], s, alignment); 2176 assert(isPointerAligned(alloc[n], alignment)); 2177 foreach(b; 0..s) 2178 (cast(ubyte*)alloc[n])[b] = cast(ubyte)n; 2179 } 2180 } 2181 foreach(n; 0..NALLOC) 2182 { 2183 alloc[n] = _mm_realloc(alloc[n], 0, ALIGNMENTS[n]); 2184 } 2185 } 2186 2187 /// Reallocate `size` bytes of memory, aligned to the alignment specified in `alignment`, 2188 /// and return a pointer to the newly allocated memory. 2189 /// `_mm_free` or `alignedRealloc` with size 0 should be used to free memory that is 2190 /// allocated with `_mm_malloc` or `_mm_realloc`. 2191 /// Previous data is discarded. 2192 void* _mm_realloc_discard(void* aligned, size_t size, size_t alignment) nothrow @nogc // #BONUS 2193 { 2194 return alignedReallocImpl!false(aligned, size, alignment); 2195 } 2196 2197 /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in `a`. 2198 /// The maximum relative error for this approximation is less than 1.5*2^-12. 2199 __m128 _mm_rsqrt_ps (__m128 a) pure @trusted 2200 { 2201 static if (DMD_with_DSIMD) 2202 { 2203 return cast(__m128) __simd(XMM.RSQRTPS, a); 2204 } 2205 else static if (GDC_with_SSE) 2206 { 2207 return __builtin_ia32_rsqrtps(a); 2208 } 2209 else static if (LDC_with_SSE1) 2210 { 2211 return __builtin_ia32_rsqrtps(a); 2212 } 2213 else version(LDC) 2214 { 2215 a[0] = 1.0f / llvm_sqrt(a[0]); 2216 a[1] = 1.0f / llvm_sqrt(a[1]); 2217 a[2] = 1.0f / llvm_sqrt(a[2]); 2218 a[3] = 1.0f / llvm_sqrt(a[3]); 2219 return a; 2220 } 2221 else 2222 { 2223 a.ptr[0] = 1.0f / sqrt(a.array[0]); 2224 a.ptr[1] = 1.0f / sqrt(a.array[1]); 2225 a.ptr[2] = 1.0f / sqrt(a.array[2]); 2226 a.ptr[3] = 1.0f / sqrt(a.array[3]); 2227 return a; 2228 } 2229 } 2230 unittest 2231 { 2232 __m128 A = _mm_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f); 2233 __m128 groundTruth = _mm_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f); 2234 __m128 result = _mm_rsqrt_ps(A); 2235 foreach(i; 0..4) 2236 { 2237 double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; 2238 assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 2239 } 2240 } 2241 2242 /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in `a`, 2243 /// store the result in the lower element. Copy the upper 3 packed elements from `a` to the upper elements of result. 2244 /// The maximum relative error for this approximation is less than 1.5*2^-12. 2245 __m128 _mm_rsqrt_ss (__m128 a) pure @trusted 2246 { 2247 static if (DMD_with_DSIMD) 2248 { 2249 return cast(__m128) __simd(XMM.RSQRTSS, a); 2250 } 2251 else static if (GDC_with_SSE) 2252 { 2253 return __builtin_ia32_rsqrtss(a); 2254 } 2255 else static if (LDC_with_SSE1) 2256 { 2257 return __builtin_ia32_rsqrtss(a); 2258 } 2259 else version(LDC) 2260 { 2261 a[0] = 1.0f / llvm_sqrt(a[0]); 2262 return a; 2263 } 2264 else 2265 { 2266 a[0] = 1.0f / sqrt(a[0]); 2267 return a; 2268 } 2269 } 2270 unittest // this one test 4 different intrinsics: _mm_rsqrt_ss, _mm_rsqrt_ps, _mm_rcp_ps, _mm_rcp_ss 2271 { 2272 double maxRelativeError = 0.000245; // -72 dB, stuff is apparently more precise than said in the doc? 2273 void testApproximateSSE(float number) nothrow @nogc 2274 { 2275 __m128 A = _mm_set1_ps(number); 2276 2277 // test _mm_rcp_ps 2278 __m128 B = _mm_rcp_ps(A); 2279 foreach(i; 0..4) 2280 { 2281 double exact = 1.0f / A.array[i]; 2282 double ratio = cast(double)(B.array[i]) / cast(double)(exact); 2283 assert(abs_double(ratio - 1) <= maxRelativeError); 2284 } 2285 2286 // test _mm_rcp_ss 2287 { 2288 B = _mm_rcp_ss(A); 2289 double exact = 1.0f / A.array[0]; 2290 double ratio = cast(double)(B.array[0]) / cast(double)(exact); 2291 assert(abs_double(ratio - 1) <= maxRelativeError); 2292 } 2293 2294 // test _mm_rsqrt_ps 2295 B = _mm_rsqrt_ps(A); 2296 foreach(i; 0..4) 2297 { 2298 double exact = 1.0f / sqrt(A.array[i]); 2299 double ratio = cast(double)(B.array[i]) / cast(double)(exact); 2300 assert(abs_double(ratio - 1) <= maxRelativeError); 2301 } 2302 2303 // test _mm_rsqrt_ss 2304 { 2305 B = _mm_rsqrt_ss(A); 2306 double exact = 1.0f / sqrt(A.array[0]); 2307 double ratio = cast(double)(B.array[0]) / cast(double)(exact); 2308 assert(abs_double(ratio - 1) <= maxRelativeError); 2309 } 2310 } 2311 2312 testApproximateSSE(0.00001f); 2313 testApproximateSSE(1.1f); 2314 testApproximateSSE(345.0f); 2315 testApproximateSSE(2.45674864151f); 2316 testApproximateSSE(700000.0f); 2317 testApproximateSSE(10000000.0f); 2318 testApproximateSSE(27841456468.0f); 2319 } 2320 2321 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 2322 /// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 2323 /// low 16 bits of result. 2324 __m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe 2325 { 2326 return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b))); 2327 } 2328 2329 /// Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer 2330 /// `_MM_MASK_xxxx`. The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, `_MM_MASK_DIV_ZERO`, 2331 /// `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`. 2332 void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe 2333 { 2334 // Note: unsupported on ARM 2335 _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx); 2336 } 2337 2338 /// Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer 2339 /// `_MM_EXCEPT_xxxx`. The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 2340 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`. 2341 void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe 2342 { 2343 // Note: unsupported on ARM 2344 _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx); 2345 } 2346 2347 /// Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer 2348 /// `_MM_FLUSH_xxxx`. The flush zero may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`. 2349 void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe 2350 { 2351 _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx); 2352 } 2353 2354 /// Set packed single-precision (32-bit) floating-point elements with the supplied values. 2355 __m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted 2356 { 2357 // Note: despite appearances, generates sensible code, 2358 // inlines correctly and is constant folded 2359 float[4] result = [e0, e1, e2, e3]; 2360 return loadUnaligned!(float4)(result.ptr); 2361 } 2362 unittest 2363 { 2364 __m128 A = _mm_set_ps(3, 2, 1, 546); 2365 float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f]; 2366 assert(A.array == correct); 2367 } 2368 2369 deprecated("Use _mm_set1_ps instead") alias _mm_set_ps1 = _mm_set1_ps; /// 2370 2371 /// Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer 2372 /// `_MM_ROUND_xxxx`. The rounding mode may contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, 2373 /// `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`. 2374 void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe 2375 { 2376 // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607 2377 version(GNU) asm @trusted { "" : : : "memory"; } 2378 _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx); 2379 } 2380 2381 /// Copy single-precision (32-bit) floating-point element `a` to the lower element of result, and zero the upper 3 elements. 2382 __m128 _mm_set_ss (float a) pure @trusted 2383 { 2384 static if (DMD_with_DSIMD) 2385 { 2386 return cast(__m128) __simd(XMM.LODSS, a); 2387 } 2388 else 2389 { 2390 __m128 r = _mm_setzero_ps(); 2391 r.ptr[0] = a; 2392 return r; 2393 } 2394 } 2395 unittest 2396 { 2397 float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f]; 2398 __m128 A = _mm_set_ss(42.0f); 2399 assert(A.array == correct); 2400 } 2401 2402 /// Broadcast single-precision (32-bit) floating-point value `a` to all elements. 2403 __m128 _mm_set1_ps (float a) pure @trusted 2404 { 2405 __m128 r = a; 2406 return r; 2407 } 2408 unittest 2409 { 2410 float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f]; 2411 __m128 A = _mm_set1_ps(42.0f); 2412 assert(A.array == correct); 2413 } 2414 2415 /// Set the MXCSR control and status register with the value in unsigned 32-bit integer `controlWord`. 2416 void _mm_setcsr(uint controlWord) @trusted 2417 { 2418 static if (LDC_with_ARM) 2419 { 2420 // Convert from SSE to ARM control word. This is done _partially_ 2421 // and only support rounding mode changes. 2422 2423 // "To alter some bits of a VFP system register without 2424 // affecting other bits, use a read-modify-write procedure" 2425 uint fpscr = arm_get_fpcr(); 2426 2427 // Bits 23 to 22 are rounding modes, however not used in NEON 2428 fpscr = fpscr & ~_MM_ROUND_MASK_ARM; 2429 switch(controlWord & _MM_ROUND_MASK) 2430 { 2431 default: 2432 case _MM_ROUND_NEAREST: fpscr |= _MM_ROUND_NEAREST_ARM; break; 2433 case _MM_ROUND_DOWN: fpscr |= _MM_ROUND_DOWN_ARM; break; 2434 case _MM_ROUND_UP: fpscr |= _MM_ROUND_UP_ARM; break; 2435 case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break; 2436 } 2437 fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM; 2438 if (controlWord & _MM_FLUSH_ZERO_MASK) 2439 fpscr |= _MM_FLUSH_ZERO_MASK_ARM; 2440 arm_set_fpcr(fpscr); 2441 } 2442 else version(GNU) 2443 { 2444 static if (GDC_with_SSE) 2445 { 2446 // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607 2447 version(GNU) asm @trusted { "" : : : "memory"; } 2448 __builtin_ia32_ldmxcsr(controlWord); 2449 } 2450 else version(X86) 2451 { 2452 asm pure nothrow @nogc @trusted 2453 { 2454 "ldmxcsr %0;\n" 2455 : 2456 : "m" (controlWord) 2457 : ; 2458 } 2459 } 2460 else 2461 static assert(false); 2462 } 2463 else version (InlineX86Asm) 2464 { 2465 asm pure nothrow @nogc @safe 2466 { 2467 ldmxcsr controlWord; 2468 } 2469 } 2470 else 2471 static assert(0, "Not yet supported"); 2472 } 2473 unittest 2474 { 2475 _mm_setcsr(_mm_getcsr()); 2476 } 2477 2478 /// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order. 2479 __m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted 2480 { 2481 float[4] result = [e3, e2, e1, e0]; 2482 return loadUnaligned!(float4)(result.ptr); 2483 } 2484 unittest 2485 { 2486 __m128 A = _mm_setr_ps(3, 2, 1, 546); 2487 float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f]; 2488 assert(A.array == correct); 2489 assert(A.array[0] == 3.0f); 2490 assert(A.array[1] == 2.0f); 2491 assert(A.array[2] == 1.0f); 2492 assert(A.array[3] == 546.0f); 2493 } 2494 2495 /// Return vector of type `__m128` with all elements set to zero. 2496 __m128 _mm_setzero_ps() pure @trusted 2497 { 2498 float4 r; 2499 r = 0.0f; 2500 return r; 2501 } 2502 unittest 2503 { 2504 __m128 R = _mm_setzero_ps(); 2505 float[4] correct = [0.0f, 0, 0, 0]; 2506 assert(R.array == correct); 2507 } 2508 2509 /// Perform a serializing operation on all store-to-memory instructions that were issued prior 2510 /// to this instruction. Guarantees that every store instruction that precedes, in program order, 2511 /// is globally visible before any store instruction which follows the fence in program order. 2512 void _mm_sfence() @trusted 2513 { 2514 version(GNU) 2515 { 2516 static if (GDC_with_SSE) 2517 { 2518 __builtin_ia32_sfence(); 2519 } 2520 else version(X86) 2521 { 2522 asm pure nothrow @nogc @trusted 2523 { 2524 "sfence;\n" : : : ; 2525 } 2526 } 2527 else 2528 static assert(false); 2529 } 2530 else static if (LDC_with_SSE1) 2531 { 2532 __builtin_ia32_sfence(); 2533 } 2534 else static if (DMD_with_asm) 2535 { 2536 asm nothrow @nogc pure @safe 2537 { 2538 sfence; 2539 } 2540 } 2541 else version(LDC) 2542 { 2543 llvm_memory_fence(); // PERF: this generates mfence instead of sfence 2544 } 2545 else 2546 static assert(false); 2547 } 2548 unittest 2549 { 2550 _mm_sfence(); 2551 } 2552 2553 /// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime. 2554 __m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @safe 2555 { 2556 return cast(__m64) shufflevector!(short4, ( (imm8 >> 0) & 3 ), 2557 ( (imm8 >> 2) & 3 ), 2558 ( (imm8 >> 4) & 3 ), 2559 ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a); 2560 } 2561 unittest 2562 { 2563 __m64 A = _mm_setr_pi16(0, 1, 2, 3); 2564 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 2565 short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A); 2566 short[4] expectedB = [ 3, 2, 1, 0 ]; 2567 assert(B.array == expectedB); 2568 } 2569 2570 /// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime. 2571 __m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe 2572 { 2573 return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b); 2574 } 2575 2576 /// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`. 2577 __m128 _mm_sqrt_ps(__m128 a) @trusted 2578 { 2579 static if (GDC_with_SSE) 2580 { 2581 return __builtin_ia32_sqrtps(a); 2582 } 2583 else version(LDC) 2584 { 2585 // Disappeared with LDC 1.11 2586 static if (__VERSION__ < 2081) 2587 return __builtin_ia32_sqrtps(a); 2588 else 2589 { 2590 a[0] = llvm_sqrt(a[0]); 2591 a[1] = llvm_sqrt(a[1]); 2592 a[2] = llvm_sqrt(a[2]); 2593 a[3] = llvm_sqrt(a[3]); 2594 return a; 2595 } 2596 } 2597 else 2598 { 2599 a.ptr[0] = sqrt(a.array[0]); 2600 a.ptr[1] = sqrt(a.array[1]); 2601 a.ptr[2] = sqrt(a.array[2]); 2602 a.ptr[3] = sqrt(a.array[3]); 2603 return a; 2604 } 2605 } 2606 unittest 2607 { 2608 __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f)); 2609 assert(A.array[0] == 2.0f); 2610 assert(A.array[1] == 2.0f); 2611 assert(A.array[2] == 2.0f); 2612 assert(A.array[3] == 2.0f); 2613 } 2614 2615 /// Compute the square root of the lower single-precision (32-bit) floating-point element in `a`, store it in the lower 2616 /// element, and copy the upper 3 packed elements from `a` to the upper elements of result. 2617 __m128 _mm_sqrt_ss(__m128 a) @trusted 2618 { 2619 static if (GDC_with_SSE) 2620 { 2621 return __builtin_ia32_sqrtss(a); 2622 } 2623 else version(LDC) 2624 { 2625 a.ptr[0] = llvm_sqrt(a.array[0]); 2626 return a; 2627 } 2628 else 2629 { 2630 a.ptr[0] = sqrt(a.array[0]); 2631 return a; 2632 } 2633 } 2634 unittest 2635 { 2636 __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f)); 2637 assert(A.array[0] == 2.0f); 2638 assert(A.array[1] == 4.0f); 2639 assert(A.array[2] == 4.0f); 2640 assert(A.array[3] == 4.0f); 2641 } 2642 2643 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 2644 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 2645 void _mm_store_ps (float* mem_addr, __m128 a) pure 2646 { 2647 __m128* aligned = cast(__m128*)mem_addr; 2648 *aligned = a; 2649 } 2650 2651 deprecated("Use _mm_store1_ps instead") alias _mm_store_ps1 = _mm_store1_ps; /// 2652 2653 /// Store the lower single-precision (32-bit) floating-point element from `a` into memory. 2654 /// `mem_addr` does not need to be aligned on any particular boundary. 2655 void _mm_store_ss (float* mem_addr, __m128 a) pure @safe 2656 { 2657 *mem_addr = a.array[0]; 2658 } 2659 unittest 2660 { 2661 float a; 2662 _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546)); 2663 assert(a == 546); 2664 } 2665 2666 /// Store the lower single-precision (32-bit) floating-point element from `a` into 4 contiguous elements in memory. 2667 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 2668 void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // TODO: shouldn't be trusted 2669 { 2670 __m128* aligned = cast(__m128*)mem_addr; 2671 __m128 r; 2672 r.ptr[0] = a.array[0]; 2673 r.ptr[1] = a.array[0]; 2674 r.ptr[2] = a.array[0]; 2675 r.ptr[3] = a.array[0]; 2676 *aligned = r; 2677 } 2678 unittest 2679 { 2680 align(16) float[4] A; 2681 _mm_store1_ps(A.ptr, _mm_set_ss(42.0f)); 2682 float[4] correct = [42.0f, 42, 42, 42]; 2683 assert(A == correct); 2684 } 2685 2686 /// Store the upper 2 single-precision (32-bit) floating-point elements from `a` into memory. 2687 void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted 2688 { 2689 long2 la = cast(long2)a; 2690 (*p).ptr[0] = la.array[1]; 2691 } 2692 unittest 2693 { 2694 __m64 R = _mm_setzero_si64(); 2695 long2 A = [13, 25]; 2696 _mm_storeh_pi(&R, cast(__m128)A); 2697 assert(R.array[0] == 25); 2698 } 2699 2700 /// Store the lower 2 single-precision (32-bit) floating-point elements from `a` into memory. 2701 void _mm_storel_pi(__m64* p, __m128 a) pure @trusted 2702 { 2703 long2 la = cast(long2)a; 2704 (*p).ptr[0] = la.array[0]; 2705 } 2706 unittest 2707 { 2708 __m64 R = _mm_setzero_si64(); 2709 long2 A = [13, 25]; 2710 _mm_storel_pi(&R, cast(__m128)A); 2711 assert(R.array[0] == 13); 2712 } 2713 2714 /// Store 4 single-precision (32-bit) floating-point elements from `a` into memory in reverse order. 2715 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 2716 void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // TODO should not be trusted 2717 { 2718 __m128* aligned = cast(__m128*)mem_addr; 2719 __m128 r; 2720 r.ptr[0] = a.array[3]; 2721 r.ptr[1] = a.array[2]; 2722 r.ptr[2] = a.array[1]; 2723 r.ptr[3] = a.array[0]; 2724 *aligned = r; 2725 } 2726 unittest 2727 { 2728 align(16) float[4] A; 2729 _mm_storer_ps(A.ptr, _mm_setr_ps(1.0f, 2, 3, 4)); 2730 float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f]; 2731 assert(A == correct); 2732 } 2733 2734 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 2735 /// `mem_addr` does not need to be aligned on any particular boundary. 2736 void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe // TODO should not be trusted 2737 { 2738 storeUnaligned!(float4)(a, mem_addr); 2739 } 2740 2741 /// Store 64-bits of integer data from `a` into memory using a non-temporal memory hint. 2742 void _mm_stream_pi (__m64* mem_addr, __m64 a) 2743 { 2744 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 2745 *mem_addr = a; // it's a regular move instead 2746 } 2747 2748 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a`s into memory using 2749 /// a non-temporal memory hint. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be 2750 /// generated. 2751 void _mm_stream_ps (float* mem_addr, __m128 a) 2752 { 2753 // BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal 2754 // needs some IR outside this function that would say: 2755 // 2756 // !0 = !{ i32 1 } 2757 // 2758 // It's a LLVM IR metadata description. 2759 __m128* dest = cast(__m128*)mem_addr; 2760 *dest = a; // it's a regular move instead 2761 } 2762 unittest 2763 { 2764 align(16) float[4] A; 2765 _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f)); 2766 assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f); 2767 } 2768 2769 /// Subtract packed single-precision (32-bit) floating-point elements in `b` from packed single-precision (32-bit) 2770 /// floating-point elements in `a`. 2771 __m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe 2772 { 2773 return a - b; 2774 } 2775 unittest 2776 { 2777 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 2778 a = _mm_sub_ps(a, a); 2779 float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f]; 2780 assert(a.array == correct); 2781 } 2782 2783 /// Subtract the lower single-precision (32-bit) floating-point element in `b` from the lower single-precision (32-bit) 2784 /// floating-point element in `a`, store the subtration result in the lower element of result, and copy the upper 3 2785 /// packed elements from a to the upper elements of result. 2786 __m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe 2787 { 2788 static if (DMD_with_DSIMD) 2789 return cast(__m128) __simd(XMM.SUBSS, a, b); 2790 else static if (GDC_with_SSE) 2791 return __builtin_ia32_subss(a, b); 2792 else 2793 { 2794 a[0] -= b[0]; 2795 return a; 2796 } 2797 } 2798 unittest 2799 { 2800 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 2801 a = _mm_sub_ss(a, a); 2802 float[4] correct = [0.0f, -2.0, 3.0f, 1.0f]; 2803 assert(a.array == correct); 2804 } 2805 2806 /// Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in row0, row1, 2807 /// row2, and row3, and store the transposed matrix in these vectors (row0 now contains column 0, etc.). 2808 void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe 2809 { 2810 __m128 tmp3, tmp2, tmp1, tmp0; 2811 tmp0 = _mm_unpacklo_ps(row0, row1); 2812 tmp2 = _mm_unpacklo_ps(row2, row3); 2813 tmp1 = _mm_unpackhi_ps(row0, row1); 2814 tmp3 = _mm_unpackhi_ps(row2, row3); 2815 row0 = _mm_movelh_ps(tmp0, tmp2); 2816 row1 = _mm_movehl_ps(tmp2, tmp0); 2817 row2 = _mm_movelh_ps(tmp1, tmp3); 2818 row3 = _mm_movehl_ps(tmp3, tmp1); 2819 } 2820 unittest 2821 { 2822 __m128 l0 = _mm_setr_ps(0, 1, 2, 3); 2823 __m128 l1 = _mm_setr_ps(4, 5, 6, 7); 2824 __m128 l2 = _mm_setr_ps(8, 9, 10, 11); 2825 __m128 l3 = _mm_setr_ps(12, 13, 14, 15); 2826 _MM_TRANSPOSE4_PS(l0, l1, l2, l3); 2827 float[4] r0 = [0.0f, 4, 8, 12]; 2828 float[4] r1 = [1.0f, 5, 9, 13]; 2829 float[4] r2 = [2.0f, 6, 10, 14]; 2830 float[4] r3 = [3.0f, 7, 11, 15]; 2831 assert(l0.array == r0); 2832 assert(l1.array == r1); 2833 assert(l2.array == r2); 2834 assert(l3.array == r3); 2835 } 2836 2837 // Note: the only difference between these intrinsics is the signalling 2838 // behaviour of quiet NaNs. This is incorrect but the case where 2839 // you would want to differentiate between qNaN and sNaN and then 2840 // treat them differently on purpose seems extremely rare. 2841 alias _mm_ucomieq_ss = _mm_comieq_ss; 2842 alias _mm_ucomige_ss = _mm_comige_ss; 2843 alias _mm_ucomigt_ss = _mm_comigt_ss; 2844 alias _mm_ucomile_ss = _mm_comile_ss; 2845 alias _mm_ucomilt_ss = _mm_comilt_ss; 2846 alias _mm_ucomineq_ss = _mm_comineq_ss; 2847 2848 /// Return vector of type `__m128` with undefined elements. 2849 __m128 _mm_undefined_ps() pure @safe 2850 { 2851 __m128 undef = void; 2852 return undef; 2853 } 2854 2855 /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half `a` and `b`. 2856 __m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted 2857 { 2858 version(LDC) 2859 { 2860 // x86: plain version generates unpckhps with LDC 1.0.0 -O1, but shufflevector 8 less instructions in -O0 2861 return shufflevectorLDC!(__m128, 2, 6, 3, 7)(a, b); 2862 } 2863 else 2864 { 2865 __m128 r; 2866 r.ptr[0] = a.array[2]; 2867 r.ptr[1] = b.array[2]; 2868 r.ptr[2] = a.array[3]; 2869 r.ptr[3] = b.array[3]; 2870 return r; 2871 } 2872 } 2873 unittest 2874 { 2875 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 2876 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 2877 __m128 R = _mm_unpackhi_ps(A, B); 2878 float[4] correct = [3.0f, 7.0f, 4.0f, 8.0f]; 2879 assert(R.array == correct); 2880 } 2881 2882 /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of `a` and `b`. 2883 __m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted 2884 { 2885 version(LDC) 2886 { 2887 // x86: plain version generates unpckhps with LDC 1.0.0 -O1, but shufflevector 8 less instructions in -O0 2888 return shufflevectorLDC!(__m128, 0, 4, 1, 5)(a, b); 2889 } 2890 else 2891 { 2892 __m128 r; 2893 r.ptr[0] = a.array[0]; 2894 r.ptr[1] = b.array[0]; 2895 r.ptr[2] = a.array[1]; 2896 r.ptr[3] = b.array[1]; 2897 return r; 2898 } 2899 } 2900 unittest 2901 { 2902 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 2903 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 2904 __m128 R = _mm_unpacklo_ps(A, B); 2905 float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f]; 2906 assert(R.array == correct); 2907 } 2908 2909 /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`. 2910 __m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe 2911 { 2912 return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b); 2913 } 2914 2915 2916 private 2917 { 2918 // Returns: `true` if the pointer is suitably aligned. 2919 bool isPointerAligned(void* p, size_t alignment) pure 2920 { 2921 assert(alignment != 0); 2922 return ( cast(size_t)p & (alignment - 1) ) == 0; 2923 } 2924 2925 // Returns: next pointer aligned with alignment bytes. 2926 void* nextAlignedPointer(void* start, size_t alignment) pure 2927 { 2928 return cast(void*)nextMultipleOf(cast(size_t)(start), alignment); 2929 } 2930 2931 // Returns number of bytes to actually allocate when asking 2932 // for a particular alignment 2933 @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure 2934 { 2935 enum size_t pointerSize = size_t.sizeof; 2936 return askedSize + alignment - 1 + pointerSize * 3; 2937 } 2938 2939 // Store pointer given by malloc + size + alignment 2940 @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure 2941 { 2942 enum size_t pointerSize = size_t.sizeof; 2943 char* start = cast(char*)raw + pointerSize * 3; 2944 void* aligned = nextAlignedPointer(start, alignment); 2945 void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize); 2946 *rawLocation = raw; 2947 size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize); 2948 *sizeLocation = size; 2949 size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize); 2950 *alignmentLocation = alignment; 2951 assert( isPointerAligned(aligned, alignment) ); 2952 return aligned; 2953 } 2954 2955 // Returns: x, multiple of powerOfTwo, so that x >= n. 2956 @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow 2957 { 2958 // check power-of-two 2959 assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0)); 2960 2961 size_t mask = ~(powerOfTwo - 1); 2962 return (n + powerOfTwo - 1) & mask; 2963 } 2964 2965 void* alignedReallocImpl(bool PreserveDataIfResized)(void* aligned, size_t size, size_t alignment) 2966 { 2967 if (aligned is null) 2968 return _mm_malloc(size, alignment); 2969 2970 assert(alignment != 0); 2971 assert(isPointerAligned(aligned, alignment)); 2972 2973 size_t previousSize = *cast(size_t*)(cast(char*)aligned - size_t.sizeof * 2); 2974 size_t prevAlignment = *cast(size_t*)(cast(char*)aligned - size_t.sizeof * 3); 2975 2976 // It is illegal to change the alignment across calls. 2977 assert(prevAlignment == alignment); 2978 2979 void* raw = *cast(void**)(cast(char*)aligned - size_t.sizeof); 2980 size_t request = requestedSize(size, alignment); 2981 size_t previousRequest = requestedSize(previousSize, alignment); 2982 assert(previousRequest - request == previousSize - size); 2983 2984 // Heuristic: if a requested size is within 50% to 100% of what is already allocated 2985 // then exit with the same pointer 2986 // PERF it seems like `realloc` should do that, not us. 2987 if ( (previousRequest < request * 4) && (request <= previousRequest) ) 2988 return aligned; 2989 2990 void* newRaw = malloc(request); 2991 if (request > 0 && newRaw == null) // realloc(0) can validly return anything 2992 onOutOfMemoryError(); 2993 2994 void* newAligned = storeRawPointerPlusInfo(newRaw, size, alignment); 2995 2996 static if (PreserveDataIfResized) 2997 { 2998 size_t minSize = size < previousSize ? size : previousSize; 2999 memcpy(newAligned, aligned, minSize); 3000 } 3001 3002 // Free previous data 3003 _mm_free(aligned); 3004 assert(isPointerAligned(newAligned, alignment)); 3005 return newAligned; 3006 } 3007 } 3008 3009 unittest 3010 { 3011 assert(nextMultipleOf(0, 4) == 0); 3012 assert(nextMultipleOf(1, 4) == 4); 3013 assert(nextMultipleOf(2, 4) == 4); 3014 assert(nextMultipleOf(3, 4) == 4); 3015 assert(nextMultipleOf(4, 4) == 4); 3016 assert(nextMultipleOf(5, 4) == 8); 3017 3018 { 3019 void* p = _mm_malloc(23, 16); 3020 assert(p !is null); 3021 assert(((cast(size_t)p) & 0xf) == 0); 3022 _mm_free(p); 3023 } 3024 3025 void* nullAlloc = _mm_malloc(0, 32); 3026 assert(nullAlloc != null); 3027 _mm_free(nullAlloc); 3028 } 3029 3030 // For some reason, order of declaration is important for this one 3031 // so it is misplaced. 3032 // Note: is just another name for _mm_cvtss_si32 3033 alias _mm_cvt_ss2si = _mm_cvtss_si32;