1 /** 2 * SSE intrinsics. 3 * 4 * Copyright: Copyright Guillaume Piolat 2016-2020. 5 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 6 */ 7 module inteli.xmmintrin; 8 9 public import inteli.types; 10 11 import inteli.internals; 12 13 import inteli.mmx; 14 import inteli.emmintrin; 15 16 import core.stdc.stdlib: malloc, free; 17 import core.exception: onOutOfMemoryError; 18 19 version(D_InlineAsm_X86) 20 version = InlineX86Asm; 21 else version(D_InlineAsm_X86_64) 22 version = InlineX86Asm; 23 24 25 // SSE1 26 27 nothrow @nogc: 28 29 30 enum int _MM_EXCEPT_INVALID = 0x0001; /// MXCSR Exception states. 31 enum int _MM_EXCEPT_DENORM = 0x0002; ///ditto 32 enum int _MM_EXCEPT_DIV_ZERO = 0x0004; ///ditto 33 enum int _MM_EXCEPT_OVERFLOW = 0x0008; ///ditto 34 enum int _MM_EXCEPT_UNDERFLOW = 0x0010; ///ditto 35 enum int _MM_EXCEPT_INEXACT = 0x0020; ///ditto 36 enum int _MM_EXCEPT_MASK = 0x003f; /// MXCSR Exception states mask. 37 38 enum int _MM_MASK_INVALID = 0x0080; /// MXCSR Exception masks. 39 enum int _MM_MASK_DENORM = 0x0100; ///ditto 40 enum int _MM_MASK_DIV_ZERO = 0x0200; ///ditto 41 enum int _MM_MASK_OVERFLOW = 0x0400; ///ditto 42 enum int _MM_MASK_UNDERFLOW = 0x0800; ///ditto 43 enum int _MM_MASK_INEXACT = 0x1000; ///ditto 44 enum int _MM_MASK_MASK = 0x1f80; /// MXCSR Exception masks mask. 45 46 enum int _MM_ROUND_NEAREST = 0x0000; /// MXCSR Rounding mode. 47 enum int _MM_ROUND_DOWN = 0x2000; ///ditto 48 enum int _MM_ROUND_UP = 0x4000; ///ditto 49 enum int _MM_ROUND_TOWARD_ZERO = 0x6000; ///ditto 50 enum int _MM_ROUND_MASK = 0x6000; /// MXCSR Rounding mode mask. 51 52 enum int _MM_FLUSH_ZERO_MASK = 0x8000; /// MXCSR Denormal flush to zero mask. 53 enum int _MM_FLUSH_ZERO_ON = 0x8000; /// MXCSR Denormal flush to zero modes. 54 enum int _MM_FLUSH_ZERO_OFF = 0x0000; ///ditto 55 56 /// Add packed single-precision (32-bit) floating-point elements in `a` and `b`. 57 __m128 _mm_add_ps(__m128 a, __m128 b) pure @safe 58 { 59 return a + b; 60 } 61 unittest 62 { 63 __m128 a = [1, 2, 3, 4]; 64 a = _mm_add_ps(a, a); 65 assert(a.array[0] == 2); 66 assert(a.array[1] == 4); 67 assert(a.array[2] == 6); 68 assert(a.array[3] == 8); 69 } 70 71 /// Add the lower single-precision (32-bit) floating-point element 72 /// in `a` and `b`, store the result in the lower element of result, 73 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 74 __m128 _mm_add_ss(__m128 a, __m128 b) pure @safe 75 { 76 static if (GDC_with_SSE) 77 { 78 return __builtin_ia32_addss(a, b); 79 } 80 else static if (DMD_with_DSIMD) 81 { 82 return cast(__m128) __simd(XMM.ADDSS, a, b); 83 } 84 else 85 { 86 a[0] += b[0]; 87 return a; 88 } 89 } 90 unittest 91 { 92 __m128 a = [1, 2, 3, 4]; 93 a = _mm_add_ss(a, a); 94 assert(a.array == [2.0f, 2, 3, 4]); 95 } 96 97 /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`. 98 __m128 _mm_and_ps (__m128 a, __m128 b) pure @safe 99 { 100 return cast(__m128)(cast(__m128i)a & cast(__m128i)b); 101 } 102 unittest 103 { 104 float a = 4.32f; 105 float b = -78.99f; 106 int correct = (*cast(int*)(&a)) & (*cast(int*)(&b)); 107 __m128 A = _mm_set_ps(a, b, a, b); 108 __m128 B = _mm_set_ps(b, a, b, a); 109 int4 R = cast(int4)( _mm_and_ps(A, B) ); 110 assert(R.array[0] == correct); 111 assert(R.array[1] == correct); 112 assert(R.array[2] == correct); 113 assert(R.array[3] == correct); 114 } 115 116 /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`. 
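/// A small usage sketch (values and the `absX` name are illustrative, not part of the API):
/// clearing the sign bit with this intrinsic is a common way to take a per-lane absolute value.
/// ---
/// __m128 signMask = _mm_set1_ps(-0.0f);            // only the sign bit set in each lane
/// __m128 x        = _mm_setr_ps(1.0f, -2.0f, 3.0f, -4.0f);
/// __m128 absX     = _mm_andnot_ps(signMask, x);    // (~signMask) & x == [1, 2, 3, 4]
/// ---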
__m128 _mm_andnot_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.ANDNPS, a, b);
    else
        return cast(__m128)( (~cast(__m128i)a) & cast(__m128i)b );
}
unittest
{
    float a = 4.32f;
    float b = -78.99f;
    int correct = ~(*cast(int*)(&a)) & (*cast(int*)(&b));
    int correct2 = (*cast(int*)(&a)) & ~(*cast(int*)(&b));
    __m128 A = _mm_set_ps(a, b, a, b);
    __m128 B = _mm_set_ps(b, a, b, a);
    int4 R = cast(int4)( _mm_andnot_ps(A, B) );
    assert(R.array[0] == correct2);
    assert(R.array[1] == correct);
    assert(R.array[2] == correct2);
    assert(R.array[3] == correct);
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m64 _mm_avg_pu16 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_avg_epu16(to_m128i(a), to_m128i(b)));
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m64 _mm_avg_pu8 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_avg_epu8(to_m128i(a), to_m128i(b)));
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for equality.
__m128 _mm_cmpeq_ps (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPPS, a, b, 0);
    else
        return cast(__m128) cmpps!(FPComparison.oeq)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan);
    __m128 B = _mm_setr_ps(3.0f, 2.0f, float.nan, float.nan);
    __m128i R = cast(__m128i) _mm_cmpeq_ps(A, B);
    int[4] correct = [0, -1, 0, 0];
    assert(R.array == correct);
}

/// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for equality,
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
__m128 _mm_cmpeq_ss (__m128 a, __m128 b) pure @safe
{
    static if (DMD_with_DSIMD)
        return cast(__m128) __simd(XMM.CMPSS, a, b, 0);
    else
        return cast(__m128) cmpss!(FPComparison.oeq)(a, b);
}
unittest
{
    __m128 A = _mm_setr_ps(3.0f, 0, 0, 0);
    __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan);
    __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan);
    __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan);
    __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan);
    __m128i R1 = cast(__m128i) _mm_cmpeq_ss(A, B);
    __m128i R2 = cast(__m128i) _mm_cmpeq_ss(A, C);
    __m128i R3 = cast(__m128i) _mm_cmpeq_ss(A, D);
    __m128i R4 = cast(__m128i) _mm_cmpeq_ss(A, E);
    int[4] correct1 = [-1, 0, 0, 0];
    int[4] correct2 = [0, 0, 0, 0];
    int[4] correct3 = [0, 0, 0, 0];
    int[4] correct4 = [0, 0, 0, 0];
    assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4);
}

/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal.
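/// A small usage sketch (names and values are illustrative): the result is a per-lane
/// mask (all-ones or all-zeroes) that can drive a bitwise selection.
/// ---
/// __m128 x    = _mm_setr_ps(1.0f, 5.0f, 3.0f, 0.0f);
/// __m128 ge   = _mm_cmpge_ps(x, _mm_set1_ps(2.0f)); // [0, -1, -1, 0] as int lanes
/// __m128 keep = _mm_and_ps(ge, x);                  // x where x >= 2.0f, else 0.0f
/// ---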
196 __m128 _mm_cmpge_ps (__m128 a, __m128 b) pure @safe 197 { 198 static if (DMD_with_DSIMD) 199 return cast(__m128) __simd(XMM.CMPPS, b, a, 2); 200 else 201 return cast(__m128) cmpps!(FPComparison.oge)(a, b); 202 } 203 unittest 204 { 205 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 206 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 207 __m128i R = cast(__m128i) _mm_cmpge_ps(A, B); 208 int[4] correct = [0, -1,-1, 0]; 209 assert(R.array == correct); 210 } 211 212 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than-or-equal, 213 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 214 __m128 _mm_cmpge_ss (__m128 a, __m128 b) pure @safe 215 { 216 static if (DMD_with_DSIMD) 217 { 218 __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 2); 219 a[0] = c[0]; 220 return a; 221 } 222 else 223 return cast(__m128) cmpss!(FPComparison.oge)(a, b); 224 } 225 unittest 226 { 227 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 228 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 229 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 230 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 231 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 232 __m128i R1 = cast(__m128i) _mm_cmpge_ss(A, B); 233 __m128i R2 = cast(__m128i) _mm_cmpge_ss(A, C); 234 __m128i R3 = cast(__m128i) _mm_cmpge_ss(A, D); 235 __m128i R4 = cast(__m128i) _mm_cmpge_ss(A, E); 236 int[4] correct1 = [-1, 0, 0, 0]; 237 int[4] correct2 = [-1, 0, 0, 0]; 238 int[4] correct3 = [0, 0, 0, 0]; 239 int[4] correct4 = [0, 0, 0, 0]; 240 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 241 } 242 243 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for greater-than. 244 __m128 _mm_cmpgt_ps (__m128 a, __m128 b) pure @safe 245 { 246 static if (DMD_with_DSIMD) 247 return cast(__m128) __simd(XMM.CMPPS, b, a, 1); 248 else 249 return cast(__m128) cmpps!(FPComparison.ogt)(a, b); 250 } 251 unittest 252 { 253 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 254 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 255 __m128i R = cast(__m128i) _mm_cmpgt_ps(A, B); 256 int[4] correct = [0, 0,-1, 0]; 257 assert(R.array == correct); 258 } 259 260 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for greater-than, 261 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 
262 __m128 _mm_cmpgt_ss (__m128 a, __m128 b) pure @safe 263 { 264 static if (DMD_with_DSIMD) 265 { 266 __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 1); 267 a[0] = c[0]; 268 return a; 269 } 270 else 271 return cast(__m128) cmpss!(FPComparison.ogt)(a, b); 272 } 273 unittest 274 { 275 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 276 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 277 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 278 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 279 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 280 __m128i R1 = cast(__m128i) _mm_cmpgt_ss(A, B); 281 __m128i R2 = cast(__m128i) _mm_cmpgt_ss(A, C); 282 __m128i R3 = cast(__m128i) _mm_cmpgt_ss(A, D); 283 __m128i R4 = cast(__m128i) _mm_cmpgt_ss(A, E); 284 int[4] correct1 = [0, 0, 0, 0]; 285 int[4] correct2 = [-1, 0, 0, 0]; 286 int[4] correct3 = [0, 0, 0, 0]; 287 int[4] correct4 = [0, 0, 0, 0]; 288 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 289 } 290 291 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal. 292 __m128 _mm_cmple_ps (__m128 a, __m128 b) pure @safe 293 { 294 static if (DMD_with_DSIMD) 295 return cast(__m128) __simd(XMM.CMPPS, a, b, 2); 296 else 297 return cast(__m128) cmpps!(FPComparison.ole)(a, b); 298 } 299 unittest 300 { 301 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 302 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 303 __m128i R = cast(__m128i) _mm_cmple_ps(A, B); 304 int[4] correct = [-1, -1, 0, 0]; 305 assert(R.array == correct); 306 } 307 308 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than-or-equal, 309 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 310 __m128 _mm_cmple_ss (__m128 a, __m128 b) pure @safe 311 { 312 static if (DMD_with_DSIMD) 313 return cast(__m128) __simd(XMM.CMPSS, a, b, 2); 314 else 315 return cast(__m128) cmpss!(FPComparison.ole)(a, b); 316 } 317 unittest 318 { 319 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 320 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 321 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 322 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 323 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 324 __m128i R1 = cast(__m128i) _mm_cmple_ss(A, B); 325 __m128i R2 = cast(__m128i) _mm_cmple_ss(A, C); 326 __m128i R3 = cast(__m128i) _mm_cmple_ss(A, D); 327 __m128i R4 = cast(__m128i) _mm_cmple_ss(A, E); 328 int[4] correct1 = [-1, 0, 0, 0]; 329 int[4] correct2 = [0, 0, 0, 0]; 330 int[4] correct3 = [0, 0, 0, 0]; 331 int[4] correct4 = [-1, 0, 0, 0]; 332 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 333 } 334 335 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for less-than. 
336 __m128 _mm_cmplt_ps (__m128 a, __m128 b) pure @safe 337 { 338 static if (DMD_with_DSIMD) 339 return cast(__m128) __simd(XMM.CMPPS, a, b, 1); 340 else 341 return cast(__m128) cmpps!(FPComparison.olt)(a, b); 342 } 343 unittest 344 { 345 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 346 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 347 __m128i R = cast(__m128i) _mm_cmplt_ps(A, B); 348 int[4] correct = [-1, 0, 0, 0]; 349 assert(R.array == correct); 350 } 351 352 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for less-than, 353 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 354 __m128 _mm_cmplt_ss (__m128 a, __m128 b) pure @safe 355 { 356 static if (DMD_with_DSIMD) 357 return cast(__m128) __simd(XMM.CMPSS, a, b, 1); 358 else 359 return cast(__m128) cmpss!(FPComparison.olt)(a, b); 360 } 361 unittest 362 { 363 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 364 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 365 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 366 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 367 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 368 __m128i R1 = cast(__m128i) _mm_cmplt_ss(A, B); 369 __m128i R2 = cast(__m128i) _mm_cmplt_ss(A, C); 370 __m128i R3 = cast(__m128i) _mm_cmplt_ss(A, D); 371 __m128i R4 = cast(__m128i) _mm_cmplt_ss(A, E); 372 int[4] correct1 = [0, 0, 0, 0]; 373 int[4] correct2 = [0, 0, 0, 0]; 374 int[4] correct3 = [0, 0, 0, 0]; 375 int[4] correct4 = [-1, 0, 0, 0]; 376 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 377 } 378 379 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-equal. 380 __m128 _mm_cmpneq_ps (__m128 a, __m128 b) pure @safe 381 { 382 static if (DMD_with_DSIMD) 383 return cast(__m128) __simd(XMM.CMPPS, a, b, 4); 384 else 385 return cast(__m128) cmpps!(FPComparison.une)(a, b); 386 } 387 unittest 388 { 389 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 390 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 391 __m128i R = cast(__m128i) _mm_cmpneq_ps(A, B); 392 int[4] correct = [-1, 0, -1, -1]; 393 assert(R.array == correct); 394 } 395 396 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-equal, 397 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 
398 __m128 _mm_cmpneq_ss (__m128 a, __m128 b) pure @safe 399 { 400 static if (DMD_with_DSIMD) 401 return cast(__m128) __simd(XMM.CMPSS, a, b, 4); 402 else 403 return cast(__m128) cmpss!(FPComparison.une)(a, b); 404 } 405 unittest 406 { 407 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 408 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 409 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 410 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 411 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 412 __m128i R1 = cast(__m128i) _mm_cmpneq_ss(A, B); 413 __m128i R2 = cast(__m128i) _mm_cmpneq_ss(A, C); 414 __m128i R3 = cast(__m128i) _mm_cmpneq_ss(A, D); 415 __m128i R4 = cast(__m128i) _mm_cmpneq_ss(A, E); 416 int[4] correct1 = [0, 0, 0, 0]; 417 int[4] correct2 = [-1, 0, 0, 0]; 418 int[4] correct3 = [-1, 0, 0, 0]; 419 int[4] correct4 = [-1, 0, 0, 0]; 420 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 421 } 422 423 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal. 424 __m128 _mm_cmpnge_ps (__m128 a, __m128 b) pure @safe 425 { 426 static if (DMD_with_DSIMD) 427 return cast(__m128) __simd(XMM.CMPPS, b, a, 6); 428 else 429 return cast(__m128) cmpps!(FPComparison.ult)(a, b); 430 } 431 unittest 432 { 433 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 434 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 435 __m128i R = cast(__m128i) _mm_cmpnge_ps(A, B); 436 int[4] correct = [-1, 0, 0, -1]; 437 assert(R.array == correct); 438 } 439 440 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than-or-equal, 441 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 442 __m128 _mm_cmpnge_ss (__m128 a, __m128 b) pure @safe 443 { 444 static if (DMD_with_DSIMD) 445 { 446 __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 6); 447 a[0] = c[0]; 448 return a; 449 } 450 else 451 return cast(__m128) cmpss!(FPComparison.ult)(a, b); 452 } 453 unittest 454 { 455 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 456 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 457 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 458 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 459 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 460 __m128i R1 = cast(__m128i) _mm_cmpnge_ss(A, B); 461 __m128i R2 = cast(__m128i) _mm_cmpnge_ss(A, C); 462 __m128i R3 = cast(__m128i) _mm_cmpnge_ss(A, D); 463 __m128i R4 = cast(__m128i) _mm_cmpnge_ss(A, E); 464 int[4] correct1 = [0, 0, 0, 0]; 465 int[4] correct2 = [0, 0, 0, 0]; 466 int[4] correct3 = [-1, 0, 0, 0]; 467 int[4] correct4 = [-1, 0, 0, 0]; 468 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 469 } 470 471 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than. 
472 __m128 _mm_cmpngt_ps (__m128 a, __m128 b) pure @safe 473 { 474 static if (DMD_with_DSIMD) 475 return cast(__m128) __simd(XMM.CMPPS, b, a, 5); 476 else 477 return cast(__m128) cmpps!(FPComparison.ule)(a, b); 478 } 479 unittest 480 { 481 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 482 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 483 __m128i R = cast(__m128i) _mm_cmpngt_ps(A, B); 484 int[4] correct = [-1, -1, 0, -1]; 485 assert(R.array == correct); 486 } 487 488 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-greater-than, 489 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 490 __m128 _mm_cmpngt_ss (__m128 a, __m128 b) pure @safe 491 { 492 static if (DMD_with_DSIMD) 493 { 494 __m128 c = cast(__m128) __simd(XMM.CMPSS, b, a, 5); 495 a[0] = c[0]; 496 return a; 497 } 498 else 499 return cast(__m128) cmpss!(FPComparison.ule)(a, b); 500 } 501 unittest 502 { 503 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 504 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 505 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 506 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 507 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 508 __m128i R1 = cast(__m128i) _mm_cmpngt_ss(A, B); 509 __m128i R2 = cast(__m128i) _mm_cmpngt_ss(A, C); 510 __m128i R3 = cast(__m128i) _mm_cmpngt_ss(A, D); 511 __m128i R4 = cast(__m128i) _mm_cmpngt_ss(A, E); 512 int[4] correct1 = [-1, 0, 0, 0]; 513 int[4] correct2 = [0, 0, 0, 0]; 514 int[4] correct3 = [-1, 0, 0, 0]; 515 int[4] correct4 = [-1, 0, 0, 0]; 516 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 517 } 518 519 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal. 520 __m128 _mm_cmpnle_ps (__m128 a, __m128 b) pure @safe 521 { 522 static if (DMD_with_DSIMD) 523 return cast(__m128) __simd(XMM.CMPPS, a, b, 6); 524 else 525 return cast(__m128) cmpps!(FPComparison.ugt)(a, b); 526 } 527 unittest 528 { 529 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 530 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 531 __m128i R = cast(__m128i) _mm_cmpnle_ps(A, B); 532 int[4] correct = [0, 0, -1, -1]; 533 assert(R.array == correct); 534 } 535 536 537 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than-or-equal, 538 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 
539 __m128 _mm_cmpnle_ss (__m128 a, __m128 b) pure @safe 540 { 541 static if (DMD_with_DSIMD) 542 return cast(__m128) __simd(XMM.CMPSS, a, b, 6); 543 else 544 return cast(__m128) cmpss!(FPComparison.ugt)(a, b); 545 } 546 unittest 547 { 548 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 549 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 550 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 551 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 552 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 553 __m128i R1 = cast(__m128i) _mm_cmpnle_ss(A, B); 554 __m128i R2 = cast(__m128i) _mm_cmpnle_ss(A, C); 555 __m128i R3 = cast(__m128i) _mm_cmpnle_ss(A, D); 556 __m128i R4 = cast(__m128i) _mm_cmpnle_ss(A, E); 557 int[4] correct1 = [0, 0, 0, 0]; 558 int[4] correct2 = [-1, 0, 0, 0]; 559 int[4] correct3 = [-1, 0, 0, 0]; 560 int[4] correct4 = [0, 0, 0, 0]; 561 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 562 } 563 564 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than. 565 __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) pure @safe 566 { 567 static if (DMD_with_DSIMD) 568 return cast(__m128) __simd(XMM.CMPPS, a, b, 5); 569 else 570 return cast(__m128) cmpps!(FPComparison.uge)(a, b); 571 } 572 unittest 573 { 574 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 575 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 576 __m128i R = cast(__m128i) _mm_cmpnlt_ps(A, B); 577 int[4] correct = [0, -1, -1, -1]; 578 assert(R.array == correct); 579 } 580 581 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` for not-less-than, 582 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 583 __m128 _mm_cmpnlt_ss (__m128 a, __m128 b) pure @safe 584 { 585 static if (DMD_with_DSIMD) 586 return cast(__m128) __simd(XMM.CMPSS, a, b, 5); 587 else 588 return cast(__m128) cmpss!(FPComparison.uge)(a, b); 589 } 590 unittest 591 { 592 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 593 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 594 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 595 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 596 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 597 __m128i R1 = cast(__m128i) _mm_cmpnlt_ss(A, B); 598 __m128i R2 = cast(__m128i) _mm_cmpnlt_ss(A, C); 599 __m128i R3 = cast(__m128i) _mm_cmpnlt_ss(A, D); 600 __m128i R4 = cast(__m128i) _mm_cmpnlt_ss(A, E); 601 int[4] correct1 = [-1, 0, 0, 0]; 602 int[4] correct2 = [-1, 0, 0, 0]; 603 int[4] correct3 = [-1, 0, 0, 0]; 604 int[4] correct4 = [0, 0, 0, 0]; 605 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 606 } 607 608 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN. 
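/// A small usage sketch (illustrative): comparing a vector with itself gives an
/// "ordered" mask that is all-ones exactly where the lane is not NaN, which can be
/// used to scrub NaNs out of a vector.
/// ---
/// __m128 v       = _mm_setr_ps(1.0f, float.nan, 3.0f, float.nan);
/// __m128 ordered = _mm_cmpord_ps(v, v);     // [-1, 0, -1, 0] as int lanes
/// __m128 noNaN   = _mm_and_ps(ordered, v);  // NaN lanes become +0.0f
/// ---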
609 __m128 _mm_cmpord_ps (__m128 a, __m128 b) pure @safe 610 { 611 static if (DMD_with_DSIMD) 612 return cast(__m128) __simd(XMM.CMPPS, a, b, 7); 613 else 614 return cast(__m128) cmpps!(FPComparison.ord)(a, b); 615 } 616 unittest 617 { 618 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 619 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 620 __m128i R = cast(__m128i) _mm_cmpord_ps(A, B); 621 int[4] correct = [-1, -1, -1, 0]; 622 assert(R.array == correct); 623 } 624 625 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if neither is NaN, 626 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 627 __m128 _mm_cmpord_ss (__m128 a, __m128 b) pure @safe 628 { 629 static if (DMD_with_DSIMD) 630 return cast(__m128) __simd(XMM.CMPSS, a, b, 7); 631 else 632 return cast(__m128) cmpss!(FPComparison.ord)(a, b); 633 } 634 unittest 635 { 636 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 637 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 638 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 639 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 640 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 641 __m128i R1 = cast(__m128i) _mm_cmpord_ss(A, B); 642 __m128i R2 = cast(__m128i) _mm_cmpord_ss(A, C); 643 __m128i R3 = cast(__m128i) _mm_cmpord_ss(A, D); 644 __m128i R4 = cast(__m128i) _mm_cmpord_ss(A, E); 645 int[4] correct1 = [-1, 0, 0, 0]; 646 int[4] correct2 = [-1, 0, 0, 0]; 647 int[4] correct3 = [0, 0, 0, 0]; 648 int[4] correct4 = [-1, 0, 0, 0]; 649 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 650 } 651 652 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN. 653 __m128 _mm_cmpunord_ps (__m128 a, __m128 b) pure @safe 654 { 655 static if (DMD_with_DSIMD) 656 return cast(__m128) __simd(XMM.CMPPS, a, b, 3); 657 else 658 return cast(__m128) cmpps!(FPComparison.uno)(a, b); 659 } 660 unittest 661 { 662 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, float.nan); 663 __m128 B = _mm_setr_ps(3.0f, 2.0f, 1.0f, float.nan); 664 __m128i R = cast(__m128i) _mm_cmpunord_ps(A, B); 665 int[4] correct = [0, 0, 0, -1]; 666 assert(R.array == correct); 667 } 668 669 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b` to see if either is NaN. 670 /// and copy the upper 3 packed elements from `a` to the upper elements of result. 
671 __m128 _mm_cmpunord_ss (__m128 a, __m128 b) pure @safe 672 { 673 static if (DMD_with_DSIMD) 674 return cast(__m128) __simd(XMM.CMPSS, a, b, 3); 675 else return cast(__m128) cmpss!(FPComparison.uno)(a, b); 676 } 677 unittest 678 { 679 __m128 A = _mm_setr_ps(3.0f, 0, 0, 0); 680 __m128 B = _mm_setr_ps(3.0f, float.nan, float.nan, float.nan); 681 __m128 C = _mm_setr_ps(2.0f, float.nan, float.nan, float.nan); 682 __m128 D = _mm_setr_ps(float.nan, float.nan, float.nan, float.nan); 683 __m128 E = _mm_setr_ps(4.0f, float.nan, float.nan, float.nan); 684 __m128i R1 = cast(__m128i) _mm_cmpunord_ss(A, B); 685 __m128i R2 = cast(__m128i) _mm_cmpunord_ss(A, C); 686 __m128i R3 = cast(__m128i) _mm_cmpunord_ss(A, D); 687 __m128i R4 = cast(__m128i) _mm_cmpunord_ss(A, E); 688 int[4] correct1 = [0, 0, 0, 0]; 689 int[4] correct2 = [0, 0, 0, 0]; 690 int[4] correct3 = [-1, 0, 0, 0]; 691 int[4] correct4 = [0, 0, 0, 0]; 692 assert(R1.array == correct1 && R2.array == correct2 && R3.array == correct3 && R4.array == correct4); 693 } 694 695 696 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for equality, 697 /// and return the boolean result (0 or 1). 698 int _mm_comieq_ss (__m128 a, __m128 b) pure @safe 699 { 700 return a.array[0] == b.array[0]; 701 } 702 unittest 703 { 704 assert(1 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 705 assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 706 assert(0 == _mm_comieq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 707 assert(0 == _mm_comieq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 708 assert(1 == _mm_comieq_ss(_mm_set_ss(0.0), _mm_set_ss(-0.0))); 709 } 710 711 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than-or-equal, 712 /// and return the boolean result (0 or 1). 713 int _mm_comige_ss (__m128 a, __m128 b) pure @safe 714 { 715 return a.array[0] >= b.array[0]; 716 } 717 unittest 718 { 719 assert(1 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 720 assert(1 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 721 assert(0 == _mm_comige_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f))); 722 assert(0 == _mm_comige_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 723 assert(0 == _mm_comige_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 724 assert(1 == _mm_comige_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f))); 725 } 726 727 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for greater-than, 728 /// and return the boolean result (0 or 1). 729 int _mm_comigt_ss (__m128 a, __m128 b) pure @safe // comiss + seta 730 { 731 return a.array[0] > b.array[0]; 732 } 733 unittest 734 { 735 assert(0 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 736 assert(1 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 737 assert(0 == _mm_comigt_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 738 assert(0 == _mm_comigt_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 739 assert(0 == _mm_comigt_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); 740 } 741 742 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than-or-equal, 743 /// and return the boolean result (0 or 1). 
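/// A small usage sketch (the `level` and `limit` names are hypothetical):
/// ---
/// float level = 0.7f;
/// float limit = 1.0f;
/// if (_mm_comile_ss(_mm_set_ss(level), _mm_set_ss(limit)))
/// {
///     // taken, since 0.7f <= 1.0f
/// }
/// ---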
744 int _mm_comile_ss (__m128 a, __m128 b) pure @safe // comiss + setbe 745 { 746 return a.array[0] <= b.array[0]; 747 } 748 unittest 749 { 750 assert(1 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 751 assert(0 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 752 assert(1 == _mm_comile_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f))); 753 assert(0 == _mm_comile_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 754 assert(0 == _mm_comile_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 755 assert(1 == _mm_comile_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); 756 } 757 758 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for less-than, 759 /// and return the boolean result (0 or 1). 760 int _mm_comilt_ss (__m128 a, __m128 b) pure @safe // comiss + setb 761 { 762 return a.array[0] < b.array[0]; 763 } 764 unittest 765 { 766 assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 767 assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 768 assert(1 == _mm_comilt_ss(_mm_set_ss(-78.0f), _mm_set_ss(78.0f))); 769 assert(0 == _mm_comilt_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 770 assert(0 == _mm_comilt_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 771 assert(0 == _mm_comilt_ss(_mm_set_ss(-0.0f), _mm_set_ss(0.0f))); 772 } 773 774 /// Compare the lower single-precision (32-bit) floating-point element in `a` and `b` for not-equal, 775 /// and return the boolean result (0 or 1). 776 int _mm_comineq_ss (__m128 a, __m128 b) pure @safe // comiss + setne 777 { 778 return a.array[0] != b.array[0]; 779 } 780 unittest 781 { 782 assert(0 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(78.0f))); 783 assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(-78.0f))); 784 assert(1 == _mm_comineq_ss(_mm_set_ss(78.0f), _mm_set_ss(float.nan))); 785 assert(1 == _mm_comineq_ss(_mm_set_ss(float.nan), _mm_set_ss(-4.22f))); 786 assert(0 == _mm_comineq_ss(_mm_set_ss(0.0f), _mm_set_ss(-0.0f))); 787 } 788 789 /// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit) 790 /// floating-point elements, store the results in the lower 2 elements, 791 /// and copy the upper 2 packed elements from `a` to the upper elements of result. 792 alias _mm_cvt_pi2ps = _mm_cvtpi32_ps; 793 794 /// Convert 2 lower packed single-precision (32-bit) floating-point elements in `a` 795 /// to packed 32-bit integers. 796 __m64 _mm_cvt_ps2pi (__m128 a) @safe 797 { 798 return to_m64(_mm_cvtps_epi32(a)); 799 } 800 801 /// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element, 802 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 803 /// upper elements of the result. 804 __m128 _mm_cvt_si2ss (__m128 v, int x) pure @trusted 805 { 806 v.ptr[0] = cast(float)x; 807 return v; 808 } 809 unittest 810 { 811 __m128 a = _mm_cvt_si2ss(_mm_set1_ps(0.0f), 42); 812 assert(a.array == [42f, 0, 0, 0]); 813 } 814 815 /// Convert packed 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements. 
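/// A small usage sketch (illustrative values):
/// ---
/// __m64 s  = _mm_setr_pi16(-100, 200, -300, 400);
/// __m128 f = _mm_cvtpi16_ps(s);             // [-100.0f, 200.0f, -300.0f, 400.0f]
/// ---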
__m128 _mm_cvtpi16_ps (__m64 a) pure @safe
{
    __m128i ma = to_m128i(a);
    ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit
    ma = _mm_srai_epi32(_mm_slli_epi32(ma, 16), 16); // Replicate sign bit
    return _mm_cvtepi32_ps(ma);
}
unittest
{
    __m64 A = _mm_setr_pi16(-1, 2, -3, 4);
    __m128 R = _mm_cvtpi16_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert packed signed 32-bit integers in `b` to packed single-precision (32-bit)
/// floating-point elements, store the results in the lower 2 elements,
/// and copy the upper 2 packed elements from `a` to the upper elements of result.
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b) pure @trusted
{
    __m128 fb = _mm_cvtepi32_ps(to_m128i(b));
    a.ptr[0] = fb.array[0];
    a.ptr[1] = fb.array[1];
    return a;
}
unittest
{
    __m128 R = _mm_cvtpi32_ps(_mm_set1_ps(4.0f), _mm_setr_pi32(1, 2));
    float[4] correct = [1.0f, 2.0f, 4.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert packed signed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements,
/// store the results in the lower 2 elements, then convert the packed signed 32-bit integers in `b` to
/// packed single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements.
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) pure @trusted
{
    long2 l;
    l.ptr[0] = a.array[0];
    l.ptr[1] = b.array[0];
    return _mm_cvtepi32_ps(cast(__m128i)l);
}
unittest
{
    __m64 A = _mm_setr_pi32(-45, 128);
    __m64 B = _mm_setr_pi32(0, 1000);
    __m128 R = _mm_cvtpi32x2_ps(A, B);
    float[4] correct = [-45.0f, 128.0f, 0.0f, 1000.0f];
    assert(R.array == correct);
}

/// Convert the lower packed 8-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpi8_ps (__m64 a) pure @safe
{
    __m128i b = to_m128i(a);

    // Zero extend to 32-bit
    b = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    b = _mm_unpacklo_epi16(b, _mm_setzero_si128());

    // Replicate sign bit
    b = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
    return _mm_cvtepi32_ps(b);
}
unittest
{
    __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0);
    __m128 R = _mm_cvtpi8_ps(A);
    float[4] correct = [-1.0f, 2.0f, -3.0f, 4.0f];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 16-bit integers.
/// Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF.
__m64 _mm_cvtps_pi16 (__m128 a) @safe
{
    // The C++ version of this intrinsic converts to 32-bit integers first, then uses packssdw,
    // which means the 16-bit results are saturated.
    __m128i b = _mm_cvtps_epi32(a);
    b = _mm_packs_epi32(b, b);
    return to_m64(b);
}
unittest
{
    __m128 A = _mm_setr_ps(-1.0f, 2.0f, -33000.0f, 70000.0f);
    short4 R = cast(short4) _mm_cvtps_pi16(A);
    short[4] correct = [-1, 2, -32768, 32767];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers.
907 __m64 _mm_cvtps_pi32 (__m128 a) @safe 908 { 909 return to_m64(_mm_cvtps_epi32(a)); 910 } 911 unittest 912 { 913 __m128 A = _mm_setr_ps(-33000.0f, 70000.0f, -1.0f, 2.0f, ); 914 int2 R = cast(int2) _mm_cvtps_pi32(A); 915 int[2] correct = [-33000, 70000]; 916 assert(R.array == correct); 917 } 918 919 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 8-bit integers, 920 /// and store the results in lower 4 elements. 921 /// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF. 922 __m64 _mm_cvtps_pi8 (__m128 a) @safe 923 { 924 // The C++ version of this intrinsic convert to 32-bit float, then use packssdw + packsswb 925 // Which means the 8-bit integers should be saturated 926 __m128i b = _mm_cvtps_epi32(a); 927 b = _mm_packs_epi32(b, _mm_setzero_si128()); 928 b = _mm_packs_epi16(b, _mm_setzero_si128()); 929 return to_m64(b); 930 } 931 unittest 932 { 933 __m128 A = _mm_setr_ps(-1.0f, 2.0f, -129.0f, 128.0f); 934 byte8 R = cast(byte8) _mm_cvtps_pi8(A); 935 byte[8] correct = [-1, 2, -128, 127, 0, 0, 0, 0]; 936 assert(R.array == correct); 937 } 938 939 /// Convert packed unsigned 16-bit integers in `a` to packed single-precision (32-bit) floating-point elements. 940 __m128 _mm_cvtpu16_ps (__m64 a) pure @safe 941 { 942 __m128i ma = to_m128i(a); 943 ma = _mm_unpacklo_epi16(ma, _mm_setzero_si128()); // Zero-extend to 32-bit 944 return _mm_cvtepi32_ps(ma); 945 } 946 unittest 947 { 948 __m64 A = _mm_setr_pi16(-1, 2, -3, 4); 949 __m128 R = _mm_cvtpu16_ps(A); 950 float[4] correct = [65535.0f, 2.0f, 65533.0f, 4.0f]; 951 assert(R.array == correct); 952 } 953 954 /// Convert the lower packed unsigned 8-bit integers in `a` to packed single-precision (32-bit) floating-point element. 955 __m128 _mm_cvtpu8_ps (__m64 a) pure @safe 956 { 957 __m128i b = to_m128i(a); 958 959 // Zero extend to 32-bit 960 b = _mm_unpacklo_epi8(b, _mm_setzero_si128()); 961 b = _mm_unpacklo_epi16(b, _mm_setzero_si128()); 962 return _mm_cvtepi32_ps(b); 963 } 964 unittest 965 { 966 __m64 A = _mm_setr_pi8(-1, 2, -3, 4, 0, 0, 0, 0); 967 __m128 R = _mm_cvtpu8_ps(A); 968 float[4] correct = [255.0f, 2.0f, 253.0f, 4.0f]; 969 assert(R.array == correct); 970 } 971 972 /// Convert the signed 32-bit integer `b` to a single-precision (32-bit) floating-point element, 973 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 974 /// upper elements of result. 975 __m128 _mm_cvtsi32_ss(__m128 v, int x) pure @trusted 976 { 977 v.ptr[0] = cast(float)x; 978 return v; 979 } 980 unittest 981 { 982 __m128 a = _mm_cvtsi32_ss(_mm_set1_ps(0.0f), 42); 983 assert(a.array == [42.0f, 0, 0, 0]); 984 } 985 986 987 /// Convert the signed 64-bit integer `b` to a single-precision (32-bit) floating-point element, 988 /// store the result in the lower element, and copy the upper 3 packed elements from `a` to the 989 /// upper elements of result. 990 __m128 _mm_cvtsi64_ss(__m128 v, long x) pure @trusted 991 { 992 v.ptr[0] = cast(float)x; 993 return v; 994 } 995 unittest 996 { 997 __m128 a = _mm_cvtsi64_ss(_mm_set1_ps(0.0f), 42); 998 assert(a.array == [42.0f, 0, 0, 0]); 999 } 1000 1001 /// Take the lower single-precision (32-bit) floating-point element of `a`. 1002 float _mm_cvtss_f32(__m128 a) pure @safe 1003 { 1004 return a.array[0]; 1005 } 1006 1007 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit integer. 
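/// A small usage sketch (assumes the default MXCSR rounding mode, round-to-nearest-even):
/// ---
/// int i = _mm_cvtss_si32(_mm_set_ss(2.5f)); // 2, because ties round to even
/// int j = _mm_cvtss_si32(_mm_set_ss(2.6f)); // 3
/// ---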
int _mm_cvtss_si32 (__m128 a) @safe // PERF GDC
{
    static if (GDC_with_SSE)
    {
        return __builtin_ia32_cvtss2si(a);
    }
    else static if (LDC_with_SSE1)
    {
        return __builtin_ia32_cvtss2si(a);
    }
    else static if (DMD_with_DSIMD)
    {
        __m128i r = cast(__m128i) __simd(XMM.CVTPS2DQ, a); // Note: converts all 4 lanes.
        return r.array[0];
    }
    else
    {
        return convertFloatToInt32UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si32(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtss_si64 (__m128 a) @safe
{
    version(LDC)
    {
        version(X86_64)
        {
            return __builtin_ia32_cvtss2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertFloatToInt64UsingMXCSR(a.array[0]);
        }
    }
    else
    {
        return convertFloatToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(1 == _mm_cvtss_si64(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.49f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-86187 == _mm_cvtss_si64(_mm_set1_ps(-86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(86187 == _mm_cvtss_si64(_mm_set1_ps(86186.1f)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-86186 == _mm_cvtss_si64(_mm_set1_ps(-86186.9f)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit
/// integer with truncation.
int _mm_cvtt_ss2si (__m128 a) pure @safe
{
    // x86: cvttss2si always generated, even in -O0
    return cast(int)(a.array[0]);
}
alias _mm_cvttss_si32 = _mm_cvtt_ss2si; ///ditto
unittest
{
    assert(1 == _mm_cvtt_ss2si(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit
/// integers with truncation.
__m64 _mm_cvtt_ps2pi (__m128 a) pure @safe
{
    return to_m64(_mm_cvttps_epi32(a));
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit
/// integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]);
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Divide packed single-precision (32-bit) floating-point elements in `a` by packed elements in `b`.
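/// A small usage sketch (illustrative values):
/// ---
/// __m128 num = _mm_setr_ps(1.0f, 4.0f, 9.0f, 16.0f);
/// __m128 q   = _mm_div_ps(num, _mm_set1_ps(2.0f)); // [0.5f, 2.0f, 4.5f, 8.0f]
/// ---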
1110 __m128 _mm_div_ps(__m128 a, __m128 b) pure @safe 1111 { 1112 return a / b; 1113 } 1114 unittest 1115 { 1116 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1117 a = _mm_div_ps(a, a); 1118 float[4] correct = [1.0f, 1.0f, 1.0f, 1.0f]; 1119 assert(a.array == correct); 1120 } 1121 1122 /// Divide the lower single-precision (32-bit) floating-point element in `a` by the lower 1123 /// single-precision (32-bit) floating-point element in `b`, store the result in the lower 1124 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result. 1125 __m128 _mm_div_ss(__m128 a, __m128 b) pure @safe 1126 { 1127 static if (DMD_with_DSIMD) 1128 return cast(__m128) __simd(XMM.DIVSS, a, b); 1129 else static if (GDC_with_SSE) 1130 return __builtin_ia32_divss(a, b); 1131 else 1132 { 1133 a[0] /= b[0]; 1134 return a; 1135 } 1136 } 1137 unittest 1138 { 1139 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1140 a = _mm_div_ss(a, a); 1141 float[4] correct = [1.0f, -2.0, 3.0f, 1.0f]; 1142 assert(a.array == correct); 1143 } 1144 1145 /// Extract a 16-bit unsigned integer from `a`, selected with `imm8`. Zero-extended. 1146 int _mm_extract_pi16 (__m64 a, int imm8) 1147 { 1148 short4 sa = cast(short4)a; 1149 return cast(ushort)(sa.array[imm8]); 1150 } 1151 unittest 1152 { 1153 __m64 A = _mm_setr_pi16(-1, 6, 0, 4); 1154 assert(_mm_extract_pi16(A, 0) == 65535); 1155 assert(_mm_extract_pi16(A, 1) == 6); 1156 assert(_mm_extract_pi16(A, 2) == 0); 1157 assert(_mm_extract_pi16(A, 3) == 4); 1158 } 1159 1160 /// Free aligned memory that was allocated with `_mm_malloc`. 1161 void _mm_free(void * mem_addr) @trusted 1162 { 1163 // support for free(NULL) 1164 if (mem_addr is null) 1165 return; 1166 1167 // Technically we don't need to store size and alignement in the chunk, but we do in case we 1168 // have to implement _mm_realloc 1169 1170 size_t pointerSize = (void*).sizeof; 1171 void** rawLocation = cast(void**)(cast(char*)mem_addr - size_t.sizeof); 1172 size_t* alignmentLocation = cast(size_t*)(cast(char*)mem_addr - 3 * pointerSize); 1173 size_t alignment = *alignmentLocation; 1174 assert(alignment != 0); 1175 assert(isPointerAligned(mem_addr, alignment)); 1176 free(*rawLocation); 1177 } 1178 1179 /// Get the exception mask bits from the MXCSR control and status register. 1180 /// The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, 1181 /// `_MM_MASK_DIV_ZERO`, `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`. 1182 /// Note: won't correspond to reality on non-x86, where MXCSR this is emulated. 1183 uint _MM_GET_EXCEPTION_MASK() @safe 1184 { 1185 return _mm_getcsr() & _MM_MASK_MASK; 1186 } 1187 1188 /// Get the exception state bits from the MXCSR control and status register. 1189 /// The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 1190 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`. 1191 /// Note: won't correspond to reality on non-x86, where MXCSR this is emulated. No exception reported. 1192 uint _MM_GET_EXCEPTION_STATE() @safe 1193 { 1194 return _mm_getcsr() & _MM_EXCEPT_MASK; 1195 } 1196 1197 /// Get the flush zero bits from the MXCSR control and status register. 
/// The flush-to-zero mode is either `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`.
uint _MM_GET_FLUSH_ZERO_MODE() @safe
{
    return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/// Get the rounding mode bits from the MXCSR control and status register. The rounding mode is
/// one of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`.
uint _MM_GET_ROUNDING_MODE() @safe
{
    return _mm_getcsr() & _MM_ROUND_MASK;
}

/// Get the unsigned 32-bit value of the MXCSR control and status register.
/// Note: this is emulated on ARM, because there is no MXCSR register there.
uint _mm_getcsr() @trusted
{
    static if (LDC_with_ARM)
    {
        // Note: we convert the ARM FPSCR into a x86 SSE control word.
        // However, only rounding mode and flush to zero are actually set.
        // The returned control word will have all exceptions masked, and no exception detected.

        uint fpscr = arm_get_fpcr();

        uint cw = 0; // No exception detected
        if (fpscr & _MM_FLUSH_ZERO_MASK_ARM)
        {
            // ARM has a single flush-to-zero flag; it covers both x86 bits.
            // https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode
            cw |= _MM_FLUSH_ZERO_ON;
            cw |= 0x40; // set "denormals are zeros"
        }
        cw |= _MM_MASK_MASK; // All exceptions masked

        // Rounding mode
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     cw |= _MM_ROUND_NEAREST;     break;
            case _MM_ROUND_DOWN_ARM:        cw |= _MM_ROUND_DOWN;        break;
            case _MM_ROUND_UP_ARM:          cw |= _MM_ROUND_UP;          break;
            case _MM_ROUND_TOWARD_ZERO_ARM: cw |= _MM_ROUND_TOWARD_ZERO; break;
        }
        return cw;
    }
    else version(GNU)
    {
        static if (GDC_with_SSE)
        {
            return __builtin_ia32_stmxcsr();
        }
        else version(X86)
        {
            uint sseRounding = 0;
            asm pure nothrow @nogc @trusted
            {
                "stmxcsr %0;\n"
                : "=m" (sseRounding)
                :
                : ;
            }
            return sseRounding;
        }
        else
            static assert(false);
    }
    else version (InlineX86Asm)
    {
        uint controlWord;
        asm nothrow @nogc pure @safe
        {
            stmxcsr controlWord;
        }
        return controlWord;
    }
    else
        static assert(0, "Not yet supported");
}
unittest
{
    uint csr = _mm_getcsr();
}

/// Insert a 16-bit integer `i` inside `a` at the location specified by `imm8`.
__m64 _mm_insert_pi16 (__m64 v, int i, int imm8) pure @trusted
{
    short4 r = cast(short4)v;
    r.ptr[imm8 & 3] = cast(short)i;
    return cast(__m64)r;
}
unittest
{
    __m64 A = _mm_set_pi16(3, 2, 1, 0);
    short4 R = cast(short4) _mm_insert_pi16(A, 42, 1 | 4);
    short[4] correct = [0, 42, 2, 3];
    assert(R.array == correct);
}

/// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory.
/// `p` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
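/// A small usage sketch (the `buf` array is illustrative); note the 16-byte alignment:
/// ---
/// align(16) float[4] buf = [1.0f, 2.0f, 3.0f, 4.0f];
/// __m128 v = _mm_load_ps(buf.ptr);          // aligned load of all four lanes
/// ---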
1300 __m128 _mm_load_ps(const(float)*p) pure @trusted // TODO shouldn't be trusted 1301 { 1302 return *cast(__m128*)p; 1303 } 1304 unittest 1305 { 1306 static immutable align(16) float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f]; 1307 __m128 A = _mm_load_ps(correct.ptr); 1308 assert(A.array == correct); 1309 } 1310 1311 /// Load a single-precision (32-bit) floating-point element from memory into all elements. 1312 __m128 _mm_load_ps1(const(float)*p) pure @trusted 1313 { 1314 return __m128(*p); 1315 } 1316 unittest 1317 { 1318 float n = 2.5f; 1319 float[4] correct = [2.5f, 2.5f, 2.5f, 2.5f]; 1320 __m128 A = _mm_load_ps1(&n); 1321 assert(A.array == correct); 1322 } 1323 1324 /// Load a single-precision (32-bit) floating-point element from memory into the lower of dst, and zero the upper 3 1325 /// elements. `mem_addr` does not need to be aligned on any particular boundary. 1326 __m128 _mm_load_ss (const(float)* mem_addr) pure @trusted 1327 { 1328 static if (DMD_with_DSIMD) 1329 { 1330 return cast(__m128)__simd(XMM.LODSS, *cast(__m128*)mem_addr); 1331 } 1332 else 1333 { 1334 __m128 r; 1335 r.ptr[0] = *mem_addr; 1336 r.ptr[1] = 0; 1337 r.ptr[2] = 0; 1338 r.ptr[3] = 0; 1339 return r; 1340 } 1341 } 1342 unittest 1343 { 1344 float n = 2.5f; 1345 float[4] correct = [2.5f, 0.0f, 0.0f, 0.0f]; 1346 __m128 A = _mm_load_ss(&n); 1347 assert(A.array == correct); 1348 } 1349 1350 /// Load a single-precision (32-bit) floating-point element from memory into all elements. 1351 alias _mm_load1_ps = _mm_load_ps1; 1352 1353 /// Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of result, 1354 /// and copy the lower 2 elements from `a` to result. `mem_addr does` not need to be aligned on any particular boundary. 1355 __m128 _mm_loadh_pi (__m128 a, const(__m64)* mem_addr) pure @trusted 1356 { 1357 static if (DMD_with_DSIMD) 1358 { 1359 return cast(__m128) __simd(XMM.LODHPS, a, *cast(const(__m128)*)mem_addr); 1360 } 1361 else 1362 { 1363 // x86: movlhps generated since LDC 1.9.0 -O1 1364 long2 la = cast(long2)a; 1365 la.ptr[1] = (*mem_addr).array[0]; 1366 return cast(__m128)la; 1367 } 1368 } 1369 unittest 1370 { 1371 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1372 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1373 __m64 M = to_m64(cast(__m128i)B); 1374 __m128 R = _mm_loadh_pi(A, &M); 1375 float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f]; 1376 assert(R.array == correct); 1377 } 1378 1379 /// Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of result, 1380 /// and copy the upper 2 elements from `a` to result. `mem_addr` does not need to be aligned on any particular boundary. 1381 __m128 _mm_loadl_pi (__m128 a, const(__m64)* mem_addr) pure @trusted 1382 { 1383 static if (DMD_with_DSIMD) 1384 { 1385 return cast(__m128) __simd(XMM.LODLPS, a, *cast(const(__m128)*)mem_addr); 1386 } 1387 else 1388 { 1389 // x86: movlpd/movlps generated with all LDC -01 1390 long2 la = cast(long2)a; 1391 la.ptr[0] = (*mem_addr).array[0]; 1392 return cast(__m128)la; 1393 } 1394 } 1395 unittest 1396 { 1397 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1398 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1399 __m64 M = to_m64(cast(__m128i)B); 1400 __m128 R = _mm_loadl_pi(A, &M); 1401 float[4] correct = [5.0f, 6.0f, 3.0f, 4.0f]; 1402 assert(R.array == correct); 1403 } 1404 1405 /// Load 4 single-precision (32-bit) floating-point elements from memory in reverse order. 
1406 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1407 __m128 _mm_loadr_ps (const(float)* mem_addr) pure @trusted // TODO shouldn't be trusted 1408 { 1409 __m128* aligned = cast(__m128*)mem_addr; // x86: movaps + shups since LDC 1.0.0 -O1 1410 __m128 a = *aligned; 1411 static if (DMD_with_DSIMD) 1412 { 1413 return cast(__m128) __simd(XMM.SHUFPS, a, a, 27); 1414 } 1415 else 1416 { 1417 __m128 r; 1418 r.ptr[0] = a.array[3]; 1419 r.ptr[1] = a.array[2]; 1420 r.ptr[2] = a.array[1]; 1421 r.ptr[3] = a.array[0]; 1422 return r; 1423 } 1424 } 1425 unittest 1426 { 1427 align(16) static immutable float[4] arr = [ 1.0f, 2.0f, 3.0f, 8.0f ]; 1428 __m128 A = _mm_loadr_ps(arr.ptr); 1429 float[4] correct = [ 8.0f, 3.0f, 2.0f, 1.0f ]; 1430 assert(A.array == correct); 1431 } 1432 1433 /// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory. 1434 /// `mem_addr` does not need to be aligned on any particular boundary. 1435 __m128 _mm_loadu_ps(const(float)* mem_addr) pure @trusted 1436 { 1437 static if (GDC_with_SSE2) 1438 { 1439 return __builtin_ia32_loadups(mem_addr); 1440 } 1441 else version(LDC) 1442 { 1443 return loadUnaligned!(__m128)(mem_addr); 1444 } 1445 else version(DigitalMars) 1446 { 1447 static if (DMD_with_DSIMD) 1448 { 1449 return cast(__m128)__simd(XMM.LODUPS, *mem_addr); 1450 } 1451 else static if (SSESizedVectorsAreEmulated) 1452 { 1453 // Since this vector is emulated, it doesn't have alignement constraints 1454 // and as such we can just cast it. 1455 return *cast(__m128*)(mem_addr); 1456 } 1457 else 1458 { 1459 __m128 result; 1460 result.ptr[0] = mem_addr[0]; 1461 result.ptr[1] = mem_addr[1]; 1462 result.ptr[2] = mem_addr[2]; 1463 result.ptr[3] = mem_addr[3]; 1464 return result; 1465 } 1466 } 1467 else 1468 { 1469 __m128 result; 1470 result.ptr[0] = mem_addr[0]; 1471 result.ptr[1] = mem_addr[1]; 1472 result.ptr[2] = mem_addr[2]; 1473 result.ptr[3] = mem_addr[3]; 1474 return result; 1475 } 1476 } 1477 unittest 1478 { 1479 align(16) static immutable float[5] arr = [ 1.0f, 2.0f, 3.0f, 8.0f, 9.0f ]; // force unaligned load 1480 __m128 A = _mm_loadu_ps(&arr[1]); 1481 float[4] correct = [ 2.0f, 3.0f, 8.0f, 9.0f ]; 1482 assert(A.array == correct); 1483 } 1484 1485 /// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise. 1486 __m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted 1487 { 1488 static if (DMD_with_DSIMD) 1489 { 1490 int r = *cast(short*)(mem_addr); 1491 return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r); 1492 } 1493 else version(DigitalMars) 1494 { 1495 // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672 1496 // DMD cannot handle the below code... 1497 align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0]; 1498 r[0] = *cast(short*)(mem_addr); 1499 return *cast(int4*)(r.ptr); 1500 } 1501 else 1502 { 1503 short r = *cast(short*)(mem_addr); 1504 short8 result = [0, 0, 0, 0, 0, 0, 0, 0]; 1505 result.ptr[0] = r; 1506 return cast(__m128i)result; 1507 } 1508 } 1509 unittest 1510 { 1511 short r = 13; 1512 short8 A = cast(short8) _mm_loadu_si16(&r); 1513 short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0]; 1514 assert(A.array == correct); 1515 } 1516 1517 /// Load unaligned 64-bit integer from memory into the first element of result. 1518 /// Upper 64-bit is zeroed. 
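/// A small usage sketch (illustrative value); the pointer does not need to be aligned:
/// ---
/// long x = 0x0123_4567_89AB_CDEF;
/// __m128i v = _mm_loadu_si64(&x);           // low 64 bits = x, upper 64 bits zeroed
/// ---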
1519 __m128i _mm_loadu_si64(const(void)* mem_addr) pure @trusted 1520 { 1521 static if (DMD_with_DSIMD) 1522 { 1523 return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr); 1524 } 1525 else 1526 { 1527 long r = *cast(long*)(mem_addr); 1528 long2 result = [0, 0]; 1529 result.ptr[0] = r; 1530 return cast(__m128i)result; 1531 } 1532 } 1533 unittest 1534 { 1535 long r = 446446446446; 1536 long2 A = cast(long2) _mm_loadu_si64(&r); 1537 long[2] correct = [446446446446, 0]; 1538 assert(A.array == correct); 1539 } 1540 1541 /// Allocate size bytes of memory, aligned to the alignment specified in align, 1542 /// and return a pointer to the allocated memory. `_mm_free` should be used to free 1543 /// memory that is allocated with `_mm_malloc`. 1544 void* _mm_malloc(size_t size, size_t alignment) @trusted 1545 { 1546 assert(alignment != 0); 1547 size_t request = requestedSize(size, alignment); 1548 void* raw = malloc(request); 1549 if (request > 0 && raw == null) // malloc(0) can validly return anything 1550 onOutOfMemoryError(); 1551 return storeRawPointerPlusInfo(raw, size, alignment); // PERF: no need to store size 1552 } 1553 1554 /// Conditionally store 8-bit integer elements from a into memory using mask (elements are not stored when the highest 1555 /// bit is not set in the corresponding element) and a non-temporal memory hint. 1556 void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr) @trusted 1557 { 1558 // this works since mask is zero-extended 1559 return _mm_maskmoveu_si128 (to_m128i(a), to_m128i(mask), mem_addr); 1560 } 1561 1562 deprecated("Use _mm_maskmove_si64 instead") alias _m_maskmovq = _mm_maskmove_si64;/// 1563 1564 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum value. 1565 __m64 _mm_max_pi16 (__m64 a, __m64 b) pure @safe 1566 { 1567 return to_m64(_mm_max_epi16(to_m128i(a), to_m128i(b))); 1568 } 1569 1570 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed maximum values. 1571 __m128 _mm_max_ps(__m128 a, __m128 b) pure @safe 1572 { 1573 static if (DMD_with_DSIMD) 1574 { 1575 return cast(__m128) __simd(XMM.MAXPS, a, b); 1576 } 1577 else static if (GDC_with_SSE) 1578 { 1579 return __builtin_ia32_maxps(a, b); 1580 } 1581 else static if (LDC_with_SSE1) 1582 { 1583 return __builtin_ia32_maxps(a, b); 1584 } 1585 else 1586 { 1587 // ARM: Optimized into fcmgt + bsl since LDC 1.8 -02 1588 __m128 r; 1589 r[0] = (a[0] > b[0]) ? a[0] : b[0]; 1590 r[1] = (a[1] > b[1]) ? a[1] : b[1]; 1591 r[2] = (a[2] > b[2]) ? a[2] : b[2]; 1592 r[3] = (a[3] > b[3]) ? a[3] : b[3]; 1593 return r; 1594 } 1595 } 1596 unittest 1597 { 1598 __m128 A = _mm_setr_ps(1, 2, float.nan, 4); 1599 __m128 B = _mm_setr_ps(4, 1, 4, float.nan); 1600 __m128 M = _mm_max_ps(A, B); 1601 assert(M.array[0] == 4); 1602 assert(M.array[1] == 2); 1603 assert(M.array[2] == 4); // in case of NaN, second operand prevails (as it seems) 1604 assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems) 1605 } 1606 1607 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values. 1608 __m64 _mm_max_pu8 (__m64 a, __m64 b) pure @safe 1609 { 1610 return to_m64(_mm_max_epu8(to_m128i(a), to_m128i(b))); 1611 } 1612 1613 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the maximum value in the 1614 /// lower element of result, and copy the upper 3 packed elements from `a` to the upper element of result. 
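/// A small usage sketch (illustrative values): only lane 0 takes the maximum,
/// the other three lanes come from the first operand.
/// ---
/// __m128 a = _mm_setr_ps(2.0f, 10.0f, 20.0f, 30.0f);
/// __m128 r = _mm_max_ss(a, _mm_set_ss(5.0f)); // [5.0f, 10.0f, 20.0f, 30.0f]
/// ---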
1615 __m128 _mm_max_ss(__m128 a, __m128 b) pure @safe 1616 { 1617 static if (DMD_with_DSIMD) 1618 { 1619 return cast(__m128) __simd(XMM.MAXSS, a, b); 1620 } 1621 else static if (GDC_with_SSE) 1622 { 1623 return __builtin_ia32_maxss(a, b); 1624 } 1625 else static if (LDC_with_SSE1) 1626 { 1627 return __builtin_ia32_maxss(a, b); 1628 } 1629 else 1630 { 1631 __m128 r = a; 1632 r[0] = (a[0] > b[0]) ? a[0] : b[0]; 1633 return r; 1634 } 1635 } 1636 unittest 1637 { 1638 __m128 A = _mm_setr_ps(1, 2, 3, 4); 1639 __m128 B = _mm_setr_ps(4, 1, 4, 1); 1640 __m128 C = _mm_setr_ps(float.nan, 1, 4, 1); 1641 __m128 M = _mm_max_ss(A, B); 1642 assert(M.array[0] == 4); 1643 assert(M.array[1] == 2); 1644 assert(M.array[2] == 3); 1645 assert(M.array[3] == 4); 1646 M = _mm_max_ps(A, C); // in case of NaN, second operand prevails 1647 assert(M.array[0] != M.array[0]); 1648 M = _mm_max_ps(C, A); // in case of NaN, second operand prevails 1649 assert(M.array[0] == 1); 1650 } 1651 1652 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values. 1653 __m64 _mm_min_pi16 (__m64 a, __m64 b) pure @safe 1654 { 1655 return to_m64(_mm_min_epi16(to_m128i(a), to_m128i(b))); 1656 } 1657 1658 /// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`, and return packed minimum values. 1659 __m128 _mm_min_ps(__m128 a, __m128 b) pure @safe 1660 { 1661 static if (DMD_with_DSIMD) 1662 { 1663 return cast(__m128) __simd(XMM.MINPS, a, b); 1664 } 1665 else static if (GDC_with_SSE) 1666 { 1667 return __builtin_ia32_minps(a, b); 1668 } 1669 else static if (LDC_with_SSE1) 1670 { 1671 // not technically needed, but better perf in debug mode 1672 return __builtin_ia32_minps(a, b); 1673 } 1674 else 1675 { 1676 // ARM: Optimized into fcmgt + bsl since LDC 1.8 -O2 1677 __m128 r; 1678 r[0] = (a[0] < b[0]) ? a[0] : b[0]; 1679 r[1] = (a[1] < b[1]) ? a[1] : b[1]; 1680 r[2] = (a[2] < b[2]) ? a[2] : b[2]; 1681 r[3] = (a[3] < b[3]) ? a[3] : b[3]; 1682 return r; 1683 } 1684 } 1685 unittest 1686 { 1687 __m128 A = _mm_setr_ps(1, 2, float.nan, 4); 1688 __m128 B = _mm_setr_ps(4, 1, 4, float.nan); 1689 __m128 M = _mm_min_ps(A, B); 1690 assert(M.array[0] == 1); 1691 assert(M.array[1] == 1); 1692 assert(M.array[2] == 4); // in case of NaN, second operand prevails (as it seems) 1693 assert(M.array[3] != M.array[3]); // in case of NaN, second operand prevails (as it seems) 1694 } 1695 1696 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values. 1697 __m64 _mm_min_pu8 (__m64 a, __m64 b) pure @safe 1698 { 1699 return to_m64(_mm_min_epu8(to_m128i(a), to_m128i(b))); 1700 } 1701 1702 /// Compare the lower single-precision (32-bit) floating-point elements in `a` and `b`, store the minimum value in the 1703 /// lower element of result, and copy the upper 3 packed elements from `a` to the upper elements of result. 1704 __m128 _mm_min_ss(__m128 a, __m128 b) pure @safe 1705 { 1706 static if (DMD_with_DSIMD) 1707 { 1708 return cast(__m128) __simd(XMM.MINSS, a, b); 1709 } 1710 else static if (GDC_with_SSE) 1711 { 1712 return __builtin_ia32_minss(a, b); 1713 } 1714 else static if (LDC_with_SSE1) 1715 { 1716 return __builtin_ia32_minss(a, b); 1717 } 1718 else 1719 { 1720 // Generates minss since LDC 1.3 -O1 1721 __m128 r = a; 1722 r[0] = (a[0] < b[0]) ?
a[0] : b[0]; 1723 return r; 1724 } 1725 } 1726 unittest 1727 { 1728 __m128 A = _mm_setr_ps(1, 2, 3, 4); 1729 __m128 B = _mm_setr_ps(4, 1, 4, 1); 1730 __m128 C = _mm_setr_ps(float.nan, 1, 4, 1); 1731 __m128 M = _mm_min_ss(A, B); 1732 assert(M.array[0] == 1); 1733 assert(M.array[1] == 2); 1734 assert(M.array[2] == 3); 1735 assert(M.array[3] == 4); 1736 M = _mm_min_ps(A, C); // in case of NaN, second operand prevails 1737 assert(M.array[0] != M.array[0]); 1738 M = _mm_min_ps(C, A); // in case of NaN, second operand prevails 1739 assert(M.array[0] == 1); 1740 } 1741 1742 /// Move the lower single-precision (32-bit) floating-point element from `b` to the lower element of result, and copy 1743 /// the upper 3 packed elements from `a` to the upper elements of result. 1744 __m128 _mm_move_ss (__m128 a, __m128 b) pure @trusted 1745 { 1746 // Workaround https://issues.dlang.org/show_bug.cgi?id=21673 1747 // inlining of this function fails. 1748 version(DigitalMars) asm nothrow @nogc pure { nop; } 1749 1750 a.ptr[0] = b.array[0]; 1751 return a; 1752 } 1753 unittest 1754 { 1755 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1756 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1757 __m128 R = _mm_move_ss(A, B); 1758 float[4] correct = [5.0f, 2.0f, 3.0f, 4.0f]; 1759 assert(R.array == correct); 1760 } 1761 1762 /// Move the upper 2 single-precision (32-bit) floating-point elements from `b` to the lower 2 elements of result, and 1763 /// copy the upper 2 elements from `a` to the upper 2 elements of dst. 1764 __m128 _mm_movehl_ps (__m128 a, __m128 b) pure @trusted 1765 { 1766 // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443 1767 /* 1768 static if (DMD_with_DSIMD) 1769 { 1770 1771 return cast(__m128) __simd(XMM.MOVHLPS, a, b); 1772 } 1773 else */ 1774 { 1775 a.ptr[0] = b.array[2]; 1776 a.ptr[1] = b.array[3]; 1777 return a; 1778 } 1779 } 1780 unittest 1781 { 1782 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1783 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1784 __m128 R = _mm_movehl_ps(A, B); 1785 float[4] correct = [7.0f, 8.0f, 3.0f, 4.0f]; 1786 assert(R.array == correct); 1787 } 1788 1789 /// Move the lower 2 single-precision (32-bit) floating-point elements from `b` to the upper 2 elements of result, and 1790 /// copy the lower 2 elements from `a` to the lower 2 elements of result 1791 __m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted 1792 { 1793 // Disabled because of https://issues.dlang.org/show_bug.cgi?id=19443 1794 /* 1795 static if (DMD_with_DSIMD) 1796 { 1797 return cast(__m128) __simd(XMM.MOVLHPS, a, b); 1798 } 1799 else 1800 */ 1801 { 1802 a.ptr[2] = b.array[0]; 1803 a.ptr[3] = b.array[1]; 1804 return a; 1805 } 1806 } 1807 unittest 1808 { 1809 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 1810 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 1811 __m128 R = _mm_movelh_ps(A, B); 1812 float[4] correct = [1.0f, 2.0f, 5.0f, 6.0f]; 1813 assert(R.array == correct); 1814 } 1815 1816 /// Create mask from the most significant bit of each 8-bit element in `a`. 1817 int _mm_movemask_pi8 (__m64 a) pure @safe 1818 { 1819 return _mm_movemask_epi8(to_m128i(a)); 1820 } 1821 unittest 1822 { 1823 assert(0x9C == _mm_movemask_pi8(_mm_set_pi8(-1, 0, 0, -1, -1, -1, 0, 0))); 1824 } 1825 1826 /// Set each bit of result based on the most significant bit of the corresponding packed single-precision (32-bit) 1827 /// floating-point element in `a`. 
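///
/// Example (added illustration; bit N of the result is the sign bit of lane N):
/// ---
/// __m128 v = _mm_setr_ps(-1.0f, 1.0f, -0.0f, 1.0f);
/// assert(_mm_movemask_ps(v) == 0b0101); // lanes 0 and 2 have their sign bit set
/// ---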
1828 int _mm_movemask_ps (__m128 a) pure @trusted 1829 { 1830 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 1831 static if (GDC_with_SSE) 1832 { 1833 return __builtin_ia32_movmskps(a); 1834 } 1835 else static if (LDC_with_SSE1) 1836 { 1837 return __builtin_ia32_movmskps(a); 1838 } 1839 else static if (LDC_with_ARM) 1840 { 1841 int4 ai = cast(int4)a; 1842 int4 shift31 = [31, 31, 31, 31]; 1843 ai = ai >>> shift31; 1844 int4 shift = [0, 1, 2, 3]; 1845 ai = ai << shift; // 4-way shift, only efficient on ARM. 1846 int r = ai.array[0] + (ai.array[1]) + (ai.array[2]) + (ai.array[3]); 1847 return r; 1848 } 1849 else 1850 { 1851 int4 ai = cast(int4)a; 1852 int r = 0; 1853 if (ai.array[0] < 0) r += 1; 1854 if (ai.array[1] < 0) r += 2; 1855 if (ai.array[2] < 0) r += 4; 1856 if (ai.array[3] < 0) r += 8; 1857 return r; 1858 } 1859 } 1860 unittest 1861 { 1862 int4 A = [-1, 0, -43, 0]; 1863 assert(5 == _mm_movemask_ps(cast(float4)A)); 1864 } 1865 1866 /// Multiply packed single-precision (32-bit) floating-point elements in `a` and `b`. 1867 __m128 _mm_mul_ps(__m128 a, __m128 b) pure @safe 1868 { 1869 return a * b; 1870 } 1871 unittest 1872 { 1873 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1874 a = _mm_mul_ps(a, a); 1875 float[4] correct = [2.25f, 4.0f, 9.0f, 1.0f]; 1876 assert(a.array == correct); 1877 } 1878 1879 /// Multiply the lower single-precision (32-bit) floating-point element in `a` and `b`, store the result in the lower 1880 /// element of result, and copy the upper 3 packed elements from `a` to the upper elements of result. 1881 __m128 _mm_mul_ss(__m128 a, __m128 b) pure @safe 1882 { 1883 static if (DMD_with_DSIMD) 1884 return cast(__m128) __simd(XMM.MULSS, a, b); 1885 else static if (GDC_with_SSE) 1886 return __builtin_ia32_mulss(a, b); 1887 else 1888 { 1889 a[0] *= b[0]; 1890 return a; 1891 } 1892 } 1893 unittest 1894 { 1895 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 1896 a = _mm_mul_ss(a, a); 1897 float[4] correct = [2.25f, -2.0f, 3.0f, 1.0f]; 1898 assert(a.array == correct); 1899 } 1900 1901 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, 1902 /// and return the high 16 bits of the intermediate integers. 1903 __m64 _mm_mulhi_pu16 (__m64 a, __m64 b) pure @safe 1904 { 1905 return to_m64(_mm_mulhi_epu16(to_m128i(a), to_m128i(b))); 1906 } 1907 unittest 1908 { 1909 __m64 A = _mm_setr_pi16(0, -16, 2, 3); 1910 __m64 B = _mm_set1_pi16(16384); 1911 short4 R = cast(short4)_mm_mulhi_pu16(A, B); 1912 short[4] correct = [0, 0x3FFC, 0, 0]; 1913 assert(R.array == correct); 1914 } 1915 1916 /// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in `a` and `b`, and 1917 /// return the result. 
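///
/// Example (added illustration, since this intrinsic has no unittest below):
/// ---
/// __m128 x = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
/// __m128 r = _mm_or_ps(x, _mm_setzero_ps()); // OR with an all-zero bit pattern leaves `x` unchanged
/// assert(r.array == x.array);
/// ---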
1918 __m128 _mm_or_ps (__m128 a, __m128 b) pure @safe 1919 { 1920 static if (DMD_with_DSIMD) 1921 return cast(__m128)__simd(XMM.ORPS, a, b); 1922 else 1923 return cast(__m128)(cast(__m128i)a | cast(__m128i)b); 1924 } 1925 1926 deprecated("Use _mm_avg_pu8 instead") alias _m_pavgb = _mm_avg_pu8;/// 1927 deprecated("Use _mm_avg_pu16 instead") alias _m_pavgw = _mm_avg_pu16;/// 1928 deprecated("Use _mm_extract_pi16 instead") alias _m_pextrw = _mm_extract_pi16;/// 1929 deprecated("Use _mm_insert_pi16 instead") alias _m_pinsrw = _mm_insert_pi16;/// 1930 deprecated("Use _mm_max_pi16 instead") alias _m_pmaxsw = _mm_max_pi16;/// 1931 deprecated("Use _mm_max_pu8 instead") alias _m_pmaxub = _mm_max_pu8;/// 1932 deprecated("Use _mm_min_pi16 instead") alias _m_pminsw = _mm_min_pi16;/// 1933 deprecated("Use _mm_min_pu8 instead") alias _m_pminub = _mm_min_pu8;/// 1934 deprecated("Use _mm_movemask_pi8 instead") alias _m_pmovmskb = _mm_movemask_pi8;/// 1935 deprecated("Use _mm_mulhi_pu16 instead") alias _m_pmulhuw = _mm_mulhi_pu16;/// 1936 1937 enum _MM_HINT_T0 = 3; /// 1938 enum _MM_HINT_T1 = 2; /// 1939 enum _MM_HINT_T2 = 1; /// 1940 enum _MM_HINT_NTA = 0; /// 1941 1942 1943 version(LDC) 1944 { 1945 // Starting with LLVM 10, it seems llvm.prefetch has changed its name. 1946 // Was reported at: https://github.com/ldc-developers/ldc/issues/3397 1947 static if (__VERSION__ >= 2091) 1948 { 1949 pragma(LDC_intrinsic, "llvm.prefetch.p0i8") // was "llvm.prefetch" 1950 void llvm_prefetch_fixed(void* ptr, uint rw, uint locality, uint cachetype) pure @safe; 1951 } 1952 } 1953 1954 /// Fetch the line of data from memory that contains address `p` to a location in the 1955 /// cache hierarchy specified by the locality hint i. 1956 /// 1957 /// Warning: `locality` is a compile-time parameter, unlike in Intel Intrinsics API. 1958 void _mm_prefetch(int locality)(const(void)* p) pure @trusted 1959 { 1960 static if (GDC_with_SSE) 1961 { 1962 return __builtin_prefetch(p, (locality & 0x4) >> 2, locality & 0x3); 1963 } 1964 else static if (DMD_with_DSIMD) 1965 { 1966 enum bool isWrite = (locality & 0x4) != 0; 1967 enum level = locality & 3; 1968 return prefetch!(isWrite, level)(p); 1969 } 1970 else version(LDC) 1971 { 1972 static if (__VERSION__ >= 2091) 1973 { 1974 // const_cast here. `llvm_prefetch` wants a mutable pointer 1975 llvm_prefetch_fixed( cast(void*)p, 0, locality, 1); 1976 } 1977 else 1978 { 1979 // const_cast here. 
`llvm_prefetch` wants a mutable pointer 1980 llvm_prefetch( cast(void*)p, 0, locality, 1); 1981 } 1982 } 1983 else version(D_InlineAsm_X86_64) 1984 { 1985 static if (locality == _MM_HINT_NTA) 1986 { 1987 asm pure nothrow @nogc @trusted 1988 { 1989 mov RAX, p; 1990 prefetchnta [RAX]; 1991 } 1992 } 1993 else static if (locality == _MM_HINT_T0) 1994 { 1995 asm pure nothrow @nogc @trusted 1996 { 1997 mov RAX, p; 1998 prefetcht0 [RAX]; 1999 } 2000 } 2001 else static if (locality == _MM_HINT_T1) 2002 { 2003 asm pure nothrow @nogc @trusted 2004 { 2005 mov RAX, p; 2006 prefetcht1 [RAX]; 2007 } 2008 } 2009 else static if (locality == _MM_HINT_T2) 2010 { 2011 asm pure nothrow @nogc @trusted 2012 { 2013 mov RAX, p; 2014 prefetcht2 [RAX]; 2015 } 2016 } 2017 else 2018 assert(false); // invalid locality hint 2019 } 2020 else version(D_InlineAsm_X86) 2021 { 2022 static if (locality == _MM_HINT_NTA) 2023 { 2024 asm pure nothrow @nogc @trusted 2025 { 2026 mov EAX, p; 2027 prefetchnta [EAX]; 2028 } 2029 } 2030 else static if (locality == _MM_HINT_T0) 2031 { 2032 asm pure nothrow @nogc @trusted 2033 { 2034 mov EAX, p; 2035 prefetcht0 [EAX]; 2036 } 2037 } 2038 else static if (locality == _MM_HINT_T1) 2039 { 2040 asm pure nothrow @nogc @trusted 2041 { 2042 mov EAX, p; 2043 prefetcht1 [EAX]; 2044 } 2045 } 2046 else static if (locality == _MM_HINT_T2) 2047 { 2048 asm pure nothrow @nogc @trusted 2049 { 2050 mov EAX, p; 2051 prefetcht2 [EAX]; 2052 } 2053 } 2054 else 2055 assert(false); // invalid locality hint 2056 } 2057 else 2058 { 2059 // Generic version: do nothing. From bitter experience, 2060 // it's unlikely you get ANY speed-up with manual prefetching. 2061 // Prefetching or not doesn't change program behaviour. 2062 } 2063 } 2064 unittest 2065 { 2066 // From Intel documentation: 2067 // "The amount of data prefetched is also processor implementation-dependent. It will, however, be a minimum of 2068 // 32 bytes." 2069 ubyte[256] cacheline; // though it seems it cannot generate GP fault 2070 _mm_prefetch!_MM_HINT_T0(cacheline.ptr); 2071 _mm_prefetch!_MM_HINT_T1(cacheline.ptr); 2072 _mm_prefetch!_MM_HINT_T2(cacheline.ptr); 2073 _mm_prefetch!_MM_HINT_NTA(cacheline.ptr); 2074 } 2075 2076 deprecated("Use _mm_sad_pu8 instead") alias _m_psadbw = _mm_sad_pu8;/// 2077 deprecated("Use _mm_shuffle_pi16 instead") alias _m_pshufw = _mm_shuffle_pi16;/// 2078 2079 2080 /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a`` , 2081 /// and return the results. The maximum relative error for this approximation is less than 1.5*2^-12. 
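///
/// Example (an illustrative sketch; results are approximate within the stated error bound):
/// ---
/// __m128 a = _mm_set1_ps(2.0f);
/// __m128 r = _mm_rcp_ps(a);                  // each lane is approximately 0.5f
/// assert(r.array[0] > 0.49f && r.array[0] < 0.51f);
/// ---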
2082 __m128 _mm_rcp_ps (__m128 a) pure @trusted 2083 { 2084 static if (DMD_with_DSIMD) 2085 { 2086 return cast(__m128) __simd(XMM.RCPPS, a); 2087 } 2088 else static if (GDC_with_SSE) 2089 { 2090 return __builtin_ia32_rcpps(a); 2091 } 2092 else static if (LDC_with_SSE1) 2093 { 2094 return __builtin_ia32_rcpps(a); 2095 } 2096 else 2097 { 2098 a.ptr[0] = 1.0f / a.array[0]; 2099 a.ptr[1] = 1.0f / a.array[1]; 2100 a.ptr[2] = 1.0f / a.array[2]; 2101 a.ptr[3] = 1.0f / a.array[3]; 2102 return a; 2103 } 2104 } 2105 unittest 2106 { 2107 __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f); 2108 __m128 groundTruth = _mm_set1_ps(1.0f) / A; 2109 __m128 result = _mm_rcp_ps(A); 2110 foreach(i; 0..4) 2111 { 2112 double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; 2113 assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 2114 } 2115 } 2116 2117 /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in `a`, store it 2118 /// in the lower element of the result, and copy the upper 3 packed elements from `a` to the upper elements of result. 2119 /// The maximum relative error for this approximation is less than 1.5*2^-12. 2120 __m128 _mm_rcp_ss (__m128 a) pure @trusted 2121 { 2122 static if (DMD_with_DSIMD) 2123 { 2124 return cast(__m128) __simd(XMM.RCPSS, a); 2125 } 2126 else static if (GDC_with_SSE) 2127 { 2128 return __builtin_ia32_rcpss(a); 2129 } 2130 else static if (LDC_with_SSE1) 2131 { 2132 return __builtin_ia32_rcpss(a); 2133 } 2134 else 2135 { 2136 a.ptr[0] = 1.0f / a.array[0]; 2137 return a; 2138 } 2139 } 2140 unittest 2141 { 2142 __m128 A = _mm_setr_ps(2.34f, -70000.0f, 0.00001f, 345.5f); 2143 __m128 correct = _mm_setr_ps(1 / 2.34f, -70000.0f, 0.00001f, 345.5f); 2144 __m128 R = _mm_rcp_ss(A); 2145 double relError = (cast(double)(correct.array[0]) / R.array[0]) - 1; 2146 assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 2147 assert(R.array[1] == correct.array[1]); 2148 assert(R.array[2] == correct.array[2]); 2149 assert(R.array[3] == correct.array[3]); 2150 } 2151 2152 /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in `a`. 2153 /// The maximum relative error for this approximation is less than 1.5*2^-12. 
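///
/// Example (an illustrative sketch; results are approximate within the stated error bound):
/// ---
/// __m128 a = _mm_set1_ps(4.0f);
/// __m128 r = _mm_rsqrt_ps(a);                // each lane is approximately 1/sqrt(4) = 0.5f
/// assert(r.array[3] > 0.49f && r.array[3] < 0.51f);
/// ---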
2154 __m128 _mm_rsqrt_ps (__m128 a) pure @trusted 2155 { 2156 static if (DMD_with_DSIMD) 2157 { 2158 return cast(__m128) __simd(XMM.RSQRTPS, a); 2159 } 2160 else static if (GDC_with_SSE) 2161 { 2162 return __builtin_ia32_rsqrtps(a); 2163 } 2164 else static if (LDC_with_SSE1) 2165 { 2166 return __builtin_ia32_rsqrtps(a); 2167 } 2168 else version(LDC) 2169 { 2170 a[0] = 1.0f / llvm_sqrt(a[0]); 2171 a[1] = 1.0f / llvm_sqrt(a[1]); 2172 a[2] = 1.0f / llvm_sqrt(a[2]); 2173 a[3] = 1.0f / llvm_sqrt(a[3]); 2174 return a; 2175 } 2176 else 2177 { 2178 a.ptr[0] = 1.0f / sqrt(a.array[0]); 2179 a.ptr[1] = 1.0f / sqrt(a.array[1]); 2180 a.ptr[2] = 1.0f / sqrt(a.array[2]); 2181 a.ptr[3] = 1.0f / sqrt(a.array[3]); 2182 return a; 2183 } 2184 } 2185 unittest 2186 { 2187 __m128 A = _mm_setr_ps(2.34f, 70000.0f, 0.00001f, 345.5f); 2188 __m128 groundTruth = _mm_setr_ps(0.65372045f, 0.00377964473f, 316.227766f, 0.05379921937f); 2189 __m128 result = _mm_rsqrt_ps(A); 2190 foreach(i; 0..4) 2191 { 2192 double relError = (cast(double)(groundTruth.array[i]) / result.array[i]) - 1; 2193 assert(abs_double(relError) < 0.00037); // 1.5*2^-12 is 0.00036621093 2194 } 2195 } 2196 2197 /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in `a`, 2198 /// store the result in the lower element. Copy the upper 3 packed elements from `a` to the upper elements of result. 2199 /// The maximum relative error for this approximation is less than 1.5*2^-12. 2200 __m128 _mm_rsqrt_ss (__m128 a) pure @trusted 2201 { 2202 static if (DMD_with_DSIMD) 2203 { 2204 return cast(__m128) __simd(XMM.RSQRTSS, a); 2205 } 2206 else static if (GDC_with_SSE) 2207 { 2208 return __builtin_ia32_rsqrtss(a); 2209 } 2210 else static if (LDC_with_SSE1) 2211 { 2212 return __builtin_ia32_rsqrtss(a); 2213 } 2214 else version(LDC) 2215 { 2216 a[0] = 1.0f / llvm_sqrt(a[0]); 2217 return a; 2218 } 2219 else 2220 { 2221 a[0] = 1.0f / sqrt(a[0]); 2222 return a; 2223 } 2224 } 2225 unittest // this one test 4 different intrinsics: _mm_rsqrt_ss, _mm_rsqrt_ps, _mm_rcp_ps, _mm_rcp_ss 2226 { 2227 double maxRelativeError = 0.000245; // -72 dB, stuff is apparently more precise than said in the doc? 
2228 void testApproximateSSE(float number) nothrow @nogc 2229 { 2230 __m128 A = _mm_set1_ps(number); 2231 2232 // test _mm_rcp_ps 2233 __m128 B = _mm_rcp_ps(A); 2234 foreach(i; 0..4) 2235 { 2236 double exact = 1.0f / A.array[i]; 2237 double ratio = cast(double)(B.array[i]) / cast(double)(exact); 2238 assert(abs_double(ratio - 1) <= maxRelativeError); 2239 } 2240 2241 // test _mm_rcp_ss 2242 { 2243 B = _mm_rcp_ss(A); 2244 double exact = 1.0f / A.array[0]; 2245 double ratio = cast(double)(B.array[0]) / cast(double)(exact); 2246 assert(abs_double(ratio - 1) <= maxRelativeError); 2247 } 2248 2249 // test _mm_rsqrt_ps 2250 B = _mm_rsqrt_ps(A); 2251 foreach(i; 0..4) 2252 { 2253 double exact = 1.0f / sqrt(A.array[i]); 2254 double ratio = cast(double)(B.array[i]) / cast(double)(exact); 2255 assert(abs_double(ratio - 1) <= maxRelativeError); 2256 } 2257 2258 // test _mm_rsqrt_ss 2259 { 2260 B = _mm_rsqrt_ss(A); 2261 double exact = 1.0f / sqrt(A.array[0]); 2262 double ratio = cast(double)(B.array[0]) / cast(double)(exact); 2263 assert(abs_double(ratio - 1) <= maxRelativeError); 2264 } 2265 } 2266 2267 testApproximateSSE(0.00001f); 2268 testApproximateSSE(1.1f); 2269 testApproximateSSE(345.0f); 2270 testApproximateSSE(2.45674864151f); 2271 testApproximateSSE(700000.0f); 2272 testApproximateSSE(10000000.0f); 2273 testApproximateSSE(27841456468.0f); 2274 } 2275 2276 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 2277 /// consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 2278 /// low 16 bits of result. 2279 __m64 _mm_sad_pu8 (__m64 a, __m64 b) pure @safe 2280 { 2281 return to_m64(_mm_sad_epu8(to_m128i(a), to_m128i(b))); 2282 } 2283 2284 /// Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer 2285 /// `_MM_MASK_xxxx`. The exception mask may contain any of the following flags: `_MM_MASK_INVALID`, `_MM_MASK_DIV_ZERO`, 2286 /// `_MM_MASK_DENORM`, `_MM_MASK_OVERFLOW`, `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`. 2287 void _MM_SET_EXCEPTION_MASK(int _MM_MASK_xxxx) @safe 2288 { 2289 // Note: unsupported on ARM 2290 _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | _MM_MASK_xxxx); 2291 } 2292 2293 /// Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer 2294 /// `_MM_EXCEPT_xxxx`. The exception state may contain any of the following flags: `_MM_EXCEPT_INVALID`, 2295 /// `_MM_EXCEPT_DIV_ZERO`, `_MM_EXCEPT_DENORM`, `_MM_EXCEPT_OVERFLOW`, `_MM_EXCEPT_UNDERFLOW`, `_MM_EXCEPT_INEXACT`. 2296 void _MM_SET_EXCEPTION_STATE(int _MM_EXCEPT_xxxx) @safe 2297 { 2298 // Note: unsupported on ARM 2299 _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | _MM_EXCEPT_xxxx); 2300 } 2301 2302 /// Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer 2303 /// `_MM_FLUSH_xxxx`. The flush zero may contain any of the following flags: `_MM_FLUSH_ZERO_ON` or `_MM_FLUSH_ZERO_OFF`. 2304 void _MM_SET_FLUSH_ZERO_MODE(int _MM_FLUSH_xxxx) @safe 2305 { 2306 _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_xxxx); 2307 } 2308 2309 /// Set packed single-precision (32-bit) floating-point elements with the supplied values. 
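///
/// Example (added illustration; note that `e0` ends up in the lowest lane):
/// ---
/// __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
/// float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f]; // lanes are stored lowest-first
/// assert(v.array == correct);
/// ---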
2310 __m128 _mm_set_ps (float e3, float e2, float e1, float e0) pure @trusted 2311 { 2312 // Note: despite appearances, generates sensible code, 2313 // inlines correctly and is constant folded 2314 float[4] result = [e0, e1, e2, e3]; 2315 return loadUnaligned!(float4)(result.ptr); 2316 } 2317 unittest 2318 { 2319 __m128 A = _mm_set_ps(3, 2, 1, 546); 2320 float[4] correct = [546.0f, 1.0f, 2.0f, 3.0f]; 2321 assert(A.array == correct); 2322 } 2323 2324 deprecated("Use _mm_set1_ps instead") alias _mm_set_ps1 = _mm_set1_ps; /// 2325 2326 /// Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer 2327 /// `_MM_ROUND_xxxx`. The rounding mode may contain any of the following flags: `_MM_ROUND_NEAREST`, `_MM_ROUND_DOWN`, 2328 /// `_MM_ROUND_UP`, `_MM_ROUND_TOWARD_ZERO`. 2329 void _MM_SET_ROUNDING_MODE(int _MM_ROUND_xxxx) @safe 2330 { 2331 // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607 2332 version(GNU) asm @trusted { "" : : : "memory"; } 2333 _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_xxxx); 2334 } 2335 2336 /// Copy single-precision (32-bit) floating-point element `a` to the lower element of result, and zero the upper 3 elements. 2337 __m128 _mm_set_ss (float a) pure @trusted 2338 { 2339 static if (DMD_with_DSIMD) 2340 { 2341 return cast(__m128) __simd(XMM.LODSS, a); 2342 } 2343 else 2344 { 2345 __m128 r = _mm_setzero_ps(); 2346 r.ptr[0] = a; 2347 return r; 2348 } 2349 } 2350 unittest 2351 { 2352 float[4] correct = [42.0f, 0.0f, 0.0f, 0.0f]; 2353 __m128 A = _mm_set_ss(42.0f); 2354 assert(A.array == correct); 2355 } 2356 2357 /// Broadcast single-precision (32-bit) floating-point value `a` to all elements. 2358 __m128 _mm_set1_ps (float a) pure @trusted 2359 { 2360 __m128 r = a; 2361 return r; 2362 } 2363 unittest 2364 { 2365 float[4] correct = [42.0f, 42.0f, 42.0f, 42.0f]; 2366 __m128 A = _mm_set1_ps(42.0f); 2367 assert(A.array == correct); 2368 } 2369 2370 /// Set the MXCSR control and status register with the value in unsigned 32-bit integer `controlWord`. 2371 void _mm_setcsr(uint controlWord) @trusted 2372 { 2373 static if (LDC_with_ARM) 2374 { 2375 // Convert from SSE to ARM control word. This is done _partially_ 2376 // and only support rounding mode changes. 
2377 2378 // "To alter some bits of a VFP system register without 2379 // affecting other bits, use a read-modify-write procedure" 2380 uint fpscr = arm_get_fpcr(); 2381 2382 // Bits 23 to 22 are rounding modes, however not used in NEON 2383 fpscr = fpscr & ~_MM_ROUND_MASK_ARM; 2384 switch(controlWord & _MM_ROUND_MASK) 2385 { 2386 default: 2387 case _MM_ROUND_NEAREST: fpscr |= _MM_ROUND_NEAREST_ARM; break; 2388 case _MM_ROUND_DOWN: fpscr |= _MM_ROUND_DOWN_ARM; break; 2389 case _MM_ROUND_UP: fpscr |= _MM_ROUND_UP_ARM; break; 2390 case _MM_ROUND_TOWARD_ZERO: fpscr |= _MM_ROUND_TOWARD_ZERO_ARM; break; 2391 } 2392 fpscr = fpscr & ~_MM_FLUSH_ZERO_MASK_ARM; 2393 if (controlWord & _MM_FLUSH_ZERO_MASK) 2394 fpscr |= _MM_FLUSH_ZERO_MASK_ARM; 2395 arm_set_fpcr(fpscr); 2396 } 2397 else version(GNU) 2398 { 2399 static if (GDC_with_SSE) 2400 { 2401 // Work-around for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607 2402 version(GNU) asm @trusted { "" : : : "memory"; } 2403 __builtin_ia32_ldmxcsr(controlWord); 2404 } 2405 else version(X86) 2406 { 2407 asm pure nothrow @nogc @trusted 2408 { 2409 "ldmxcsr %0;\n" 2410 : 2411 : "m" (controlWord) 2412 : ; 2413 } 2414 } 2415 else 2416 static assert(false); 2417 } 2418 else version (InlineX86Asm) 2419 { 2420 asm pure nothrow @nogc @safe 2421 { 2422 ldmxcsr controlWord; 2423 } 2424 } 2425 else 2426 static assert(0, "Not yet supported"); 2427 } 2428 unittest 2429 { 2430 _mm_setcsr(_mm_getcsr()); 2431 } 2432 2433 /// Set packed single-precision (32-bit) floating-point elements with the supplied values in reverse order. 2434 __m128 _mm_setr_ps (float e3, float e2, float e1, float e0) pure @trusted 2435 { 2436 float[4] result = [e3, e2, e1, e0]; 2437 return loadUnaligned!(float4)(result.ptr); 2438 } 2439 unittest 2440 { 2441 __m128 A = _mm_setr_ps(3, 2, 1, 546); 2442 float[4] correct = [3.0f, 2.0f, 1.0f, 546.0f]; 2443 assert(A.array == correct); 2444 assert(A.array[0] == 3.0f); 2445 assert(A.array[1] == 2.0f); 2446 assert(A.array[2] == 1.0f); 2447 assert(A.array[3] == 546.0f); 2448 } 2449 2450 /// Return vector of type `__m128` with all elements set to zero. 2451 __m128 _mm_setzero_ps() pure @trusted 2452 { 2453 float4 r; 2454 r = 0.0f; 2455 return r; 2456 } 2457 unittest 2458 { 2459 __m128 R = _mm_setzero_ps(); 2460 float[4] correct = [0.0f, 0, 0, 0]; 2461 assert(R.array == correct); 2462 } 2463 2464 /// Perform a serializing operation on all store-to-memory instructions that were issued prior 2465 /// to this instruction. Guarantees that every store instruction that precedes, in program order, 2466 /// is globally visible before any store instruction which follows the fence in program order. 2467 void _mm_sfence() @trusted 2468 { 2469 version(GNU) 2470 { 2471 static if (GDC_with_SSE) 2472 { 2473 __builtin_ia32_sfence(); 2474 } 2475 else version(X86) 2476 { 2477 asm pure nothrow @nogc @trusted 2478 { 2479 "sfence;\n" : : : ; 2480 } 2481 } 2482 else 2483 static assert(false); 2484 } 2485 else static if (LDC_with_SSE1) 2486 { 2487 __builtin_ia32_sfence(); 2488 } 2489 else static if (DMD_with_asm) 2490 { 2491 asm nothrow @nogc pure @safe 2492 { 2493 sfence; 2494 } 2495 } 2496 else version(LDC) 2497 { 2498 llvm_memory_fence(); // PERF: this generates mfence instead of sfence 2499 } 2500 else 2501 static assert(false); 2502 } 2503 unittest 2504 { 2505 _mm_sfence(); 2506 } 2507 2508 /// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime. 
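///
/// Example (an illustrative sketch; the selector is passed as a template argument):
/// ---
/// __m64 v = _mm_setr_pi16(10, 11, 12, 13);
/// short4 r = cast(short4) _mm_shuffle_pi16!(_MM_SHUFFLE(3, 3, 0, 0))(v);
/// short[4] expected = [10, 10, 13, 13];
/// assert(r.array == expected);
/// ---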
2509 __m64 _mm_shuffle_pi16(int imm8)(__m64 a) pure @safe 2510 { 2511 return cast(__m64) shufflevector!(short4, ( (imm8 >> 0) & 3 ), 2512 ( (imm8 >> 2) & 3 ), 2513 ( (imm8 >> 4) & 3 ), 2514 ( (imm8 >> 6) & 3 ))(cast(short4)a, cast(short4)a); 2515 } 2516 unittest 2517 { 2518 __m64 A = _mm_setr_pi16(0, 1, 2, 3); 2519 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 2520 short4 B = cast(short4) _mm_shuffle_pi16!SHUFFLE(A); 2521 short[4] expectedB = [ 3, 2, 1, 0 ]; 2522 assert(B.array == expectedB); 2523 } 2524 2525 /// Warning: the immediate shuffle value `imm8` is given at compile-time instead of runtime. 2526 __m128 _mm_shuffle_ps(ubyte imm)(__m128 a, __m128 b) pure @safe 2527 { 2528 return shufflevector!(__m128, imm & 3, (imm>>2) & 3, 4 + ((imm>>4) & 3), 4 + ((imm>>6) & 3) )(a, b); 2529 } 2530 2531 /// Compute the square root of packed single-precision (32-bit) floating-point elements in `a`. 2532 __m128 _mm_sqrt_ps(__m128 a) @trusted 2533 { 2534 static if (GDC_with_SSE) 2535 { 2536 return __builtin_ia32_sqrtps(a); 2537 } 2538 else version(LDC) 2539 { 2540 // Disappeared with LDC 1.11 2541 static if (__VERSION__ < 2081) 2542 return __builtin_ia32_sqrtps(a); 2543 else 2544 { 2545 a[0] = llvm_sqrt(a[0]); 2546 a[1] = llvm_sqrt(a[1]); 2547 a[2] = llvm_sqrt(a[2]); 2548 a[3] = llvm_sqrt(a[3]); 2549 return a; 2550 } 2551 } 2552 else 2553 { 2554 a.ptr[0] = sqrt(a.array[0]); 2555 a.ptr[1] = sqrt(a.array[1]); 2556 a.ptr[2] = sqrt(a.array[2]); 2557 a.ptr[3] = sqrt(a.array[3]); 2558 return a; 2559 } 2560 } 2561 unittest 2562 { 2563 __m128 A = _mm_sqrt_ps(_mm_set1_ps(4.0f)); 2564 assert(A.array[0] == 2.0f); 2565 assert(A.array[1] == 2.0f); 2566 assert(A.array[2] == 2.0f); 2567 assert(A.array[3] == 2.0f); 2568 } 2569 2570 /// Compute the square root of the lower single-precision (32-bit) floating-point element in `a`, store it in the lower 2571 /// element, and copy the upper 3 packed elements from `a` to the upper elements of result. 2572 __m128 _mm_sqrt_ss(__m128 a) @trusted 2573 { 2574 static if (GDC_with_SSE) 2575 { 2576 return __builtin_ia32_sqrtss(a); 2577 } 2578 else version(LDC) 2579 { 2580 a.ptr[0] = llvm_sqrt(a.array[0]); 2581 return a; 2582 } 2583 else 2584 { 2585 a.ptr[0] = sqrt(a.array[0]); 2586 return a; 2587 } 2588 } 2589 unittest 2590 { 2591 __m128 A = _mm_sqrt_ss(_mm_set1_ps(4.0f)); 2592 assert(A.array[0] == 2.0f); 2593 assert(A.array[1] == 4.0f); 2594 assert(A.array[2] == 4.0f); 2595 assert(A.array[3] == 4.0f); 2596 } 2597 2598 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 2599 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 2600 void _mm_store_ps (float* mem_addr, __m128 a) pure 2601 { 2602 __m128* aligned = cast(__m128*)mem_addr; 2603 *aligned = a; 2604 } 2605 2606 deprecated("Use _mm_store1_ps instead") alias _mm_store_ps1 = _mm_store1_ps; /// 2607 2608 /// Store the lower single-precision (32-bit) floating-point element from `a` into memory. 2609 /// `mem_addr` does not need to be aligned on any particular boundary. 2610 void _mm_store_ss (float* mem_addr, __m128 a) pure @safe 2611 { 2612 *mem_addr = a.array[0]; 2613 } 2614 unittest 2615 { 2616 float a; 2617 _mm_store_ss(&a, _mm_set_ps(3, 2, 1, 546)); 2618 assert(a == 546); 2619 } 2620 2621 /// Store the lower single-precision (32-bit) floating-point element from `a` into 4 contiguous elements in memory. 2622 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
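///
/// Example (an illustrative sketch; the destination must be 16-byte aligned):
/// ---
/// align(16) float[4] buf;
/// _mm_store1_ps(buf.ptr, _mm_set_ss(7.0f)); // only the lowest lane of the source is used
/// float[4] correct = [7.0f, 7.0f, 7.0f, 7.0f];
/// assert(buf == correct);
/// ---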
2623 void _mm_store1_ps(float* mem_addr, __m128 a) pure @trusted // TODO: shouldn't be trusted 2624 { 2625 __m128* aligned = cast(__m128*)mem_addr; 2626 __m128 r; 2627 r.ptr[0] = a.array[0]; 2628 r.ptr[1] = a.array[0]; 2629 r.ptr[2] = a.array[0]; 2630 r.ptr[3] = a.array[0]; 2631 *aligned = r; 2632 } 2633 unittest 2634 { 2635 align(16) float[4] A; 2636 _mm_store1_ps(A.ptr, _mm_set_ss(42.0f)); 2637 float[4] correct = [42.0f, 42, 42, 42]; 2638 assert(A == correct); 2639 } 2640 2641 /// Store the upper 2 single-precision (32-bit) floating-point elements from `a` into memory. 2642 void _mm_storeh_pi(__m64* p, __m128 a) pure @trusted 2643 { 2644 long2 la = cast(long2)a; 2645 (*p).ptr[0] = la.array[1]; 2646 } 2647 unittest 2648 { 2649 __m64 R = _mm_setzero_si64(); 2650 long2 A = [13, 25]; 2651 _mm_storeh_pi(&R, cast(__m128)A); 2652 assert(R.array[0] == 25); 2653 } 2654 2655 /// Store the lower 2 single-precision (32-bit) floating-point elements from `a` into memory. 2656 void _mm_storel_pi(__m64* p, __m128 a) pure @trusted 2657 { 2658 long2 la = cast(long2)a; 2659 (*p).ptr[0] = la.array[0]; 2660 } 2661 unittest 2662 { 2663 __m64 R = _mm_setzero_si64(); 2664 long2 A = [13, 25]; 2665 _mm_storel_pi(&R, cast(__m128)A); 2666 assert(R.array[0] == 13); 2667 } 2668 2669 /// Store 4 single-precision (32-bit) floating-point elements from `a` into memory in reverse order. 2670 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 2671 void _mm_storer_ps(float* mem_addr, __m128 a) pure @trusted // TODO should not be trusted 2672 { 2673 __m128* aligned = cast(__m128*)mem_addr; 2674 __m128 r; 2675 r.ptr[0] = a.array[3]; 2676 r.ptr[1] = a.array[2]; 2677 r.ptr[2] = a.array[1]; 2678 r.ptr[3] = a.array[0]; 2679 *aligned = r; 2680 } 2681 unittest 2682 { 2683 align(16) float[4] A; 2684 _mm_storer_ps(A.ptr, _mm_setr_ps(1.0f, 2, 3, 4)); 2685 float[4] correct = [4.0f, 3.0f, 2.0f, 1.0f]; 2686 assert(A == correct); 2687 } 2688 2689 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a` into memory. 2690 /// `mem_addr` does not need to be aligned on any particular boundary. 2691 void _mm_storeu_ps(float* mem_addr, __m128 a) pure @safe // TODO should not be trusted 2692 { 2693 storeUnaligned!(float4)(a, mem_addr); 2694 } 2695 2696 /// Store 64-bits of integer data from `a` into memory using a non-temporal memory hint. 2697 void _mm_stream_pi (__m64* mem_addr, __m64 a) 2698 { 2699 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 2700 *mem_addr = a; // it's a regular move instead 2701 } 2702 2703 /// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a`s into memory using 2704 /// a non-temporal memory hint. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be 2705 /// generated. 2706 void _mm_stream_ps (float* mem_addr, __m128 a) 2707 { 2708 // BUG: can't implement non-temporal store with LDC inlineIR since !nontemporal 2709 // needs some IR outside this function that would say: 2710 // 2711 // !0 = !{ i32 1 } 2712 // 2713 // It's a LLVM IR metadata description. 
2714 __m128* dest = cast(__m128*)mem_addr; 2715 *dest = a; // it's a regular move instead 2716 } 2717 unittest 2718 { 2719 align(16) float[4] A; 2720 _mm_stream_ps(A.ptr, _mm_set1_ps(78.0f)); 2721 assert(A[0] == 78.0f && A[1] == 78.0f && A[2] == 78.0f && A[3] == 78.0f); 2722 } 2723 2724 /// Subtract packed single-precision (32-bit) floating-point elements in `b` from packed single-precision (32-bit) 2725 /// floating-point elements in `a`. 2726 __m128 _mm_sub_ps(__m128 a, __m128 b) pure @safe 2727 { 2728 return a - b; 2729 } 2730 unittest 2731 { 2732 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 2733 a = _mm_sub_ps(a, a); 2734 float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f]; 2735 assert(a.array == correct); 2736 } 2737 2738 /// Subtract the lower single-precision (32-bit) floating-point element in `b` from the lower single-precision (32-bit) 2739 /// floating-point element in `a`, store the subtraction result in the lower element of result, and copy the upper 3 2740 /// packed elements from `a` to the upper elements of result. 2741 __m128 _mm_sub_ss(__m128 a, __m128 b) pure @safe 2742 { 2743 static if (DMD_with_DSIMD) 2744 return cast(__m128) __simd(XMM.SUBSS, a, b); 2745 else static if (GDC_with_SSE) 2746 return __builtin_ia32_subss(a, b); 2747 else 2748 { 2749 a[0] -= b[0]; 2750 return a; 2751 } 2752 } 2753 unittest 2754 { 2755 __m128 a = [1.5f, -2.0f, 3.0f, 1.0f]; 2756 a = _mm_sub_ss(a, a); 2757 float[4] correct = [0.0f, -2.0f, 3.0f, 1.0f]; 2758 assert(a.array == correct); 2759 } 2760 2761 /// Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in `row0`, `row1`, 2762 /// `row2`, and `row3`, and store the transposed matrix in these vectors (`row0` now contains column 0, etc.). 2763 void _MM_TRANSPOSE4_PS (ref __m128 row0, ref __m128 row1, ref __m128 row2, ref __m128 row3) pure @safe 2764 { 2765 __m128 tmp3, tmp2, tmp1, tmp0; 2766 tmp0 = _mm_unpacklo_ps(row0, row1); 2767 tmp2 = _mm_unpacklo_ps(row2, row3); 2768 tmp1 = _mm_unpackhi_ps(row0, row1); 2769 tmp3 = _mm_unpackhi_ps(row2, row3); 2770 row0 = _mm_movelh_ps(tmp0, tmp2); 2771 row1 = _mm_movehl_ps(tmp2, tmp0); 2772 row2 = _mm_movelh_ps(tmp1, tmp3); 2773 row3 = _mm_movehl_ps(tmp3, tmp1); 2774 } 2775 unittest 2776 { 2777 __m128 l0 = _mm_setr_ps(0, 1, 2, 3); 2778 __m128 l1 = _mm_setr_ps(4, 5, 6, 7); 2779 __m128 l2 = _mm_setr_ps(8, 9, 10, 11); 2780 __m128 l3 = _mm_setr_ps(12, 13, 14, 15); 2781 _MM_TRANSPOSE4_PS(l0, l1, l2, l3); 2782 float[4] r0 = [0.0f, 4, 8, 12]; 2783 float[4] r1 = [1.0f, 5, 9, 13]; 2784 float[4] r2 = [2.0f, 6, 10, 14]; 2785 float[4] r3 = [3.0f, 7, 11, 15]; 2786 assert(l0.array == r0); 2787 assert(l1.array == r1); 2788 assert(l2.array == r2); 2789 assert(l3.array == r3); 2790 } 2791 2792 // Note: the only difference between these `u` intrinsics and their non-`u` counterparts is the signalling 2793 // behaviour on quiet NaNs, so aliasing them is slightly incorrect. However, the case where 2794 // you would want to differentiate between qNaN and sNaN and then 2795 // treat them differently on purpose seems extremely rare. 2796 alias _mm_ucomieq_ss = _mm_comieq_ss; 2797 alias _mm_ucomige_ss = _mm_comige_ss; 2798 alias _mm_ucomigt_ss = _mm_comigt_ss; 2799 alias _mm_ucomile_ss = _mm_comile_ss; 2800 alias _mm_ucomilt_ss = _mm_comilt_ss; 2801 alias _mm_ucomineq_ss = _mm_comineq_ss; 2802 2803 /// Return vector of type `__m128` with undefined elements. 2804 __m128 _mm_undefined_ps() pure @safe 2805 { 2806 __m128 undef = void; 2807 return undef; 2808 } 2809 2810 /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of `a` and `b`.
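///
/// Example (added illustration; the result interleaves the two upper halves):
/// ---
/// __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
/// __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
/// float[4] correct = [3.0f, 7.0f, 4.0f, 8.0f]; // [a2, b2, a3, b3]
/// assert(_mm_unpackhi_ps(a, b).array == correct);
/// ---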
2811 __m128 _mm_unpackhi_ps (__m128 a, __m128 b) pure @trusted 2812 { 2813 version(LDC) 2814 { 2815 // x86: plain version generates unpckhps with LDC 1.0.0 -O1, but shufflevector 8 less instructions in -O0 2816 return shufflevectorLDC!(__m128, 2, 6, 3, 7)(a, b); 2817 } 2818 else 2819 { 2820 __m128 r; 2821 r.ptr[0] = a.array[2]; 2822 r.ptr[1] = b.array[2]; 2823 r.ptr[2] = a.array[3]; 2824 r.ptr[3] = b.array[3]; 2825 return r; 2826 } 2827 } 2828 unittest 2829 { 2830 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 2831 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 2832 __m128 R = _mm_unpackhi_ps(A, B); 2833 float[4] correct = [3.0f, 7.0f, 4.0f, 8.0f]; 2834 assert(R.array == correct); 2835 } 2836 2837 /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of `a` and `b`. 2838 __m128 _mm_unpacklo_ps (__m128 a, __m128 b) pure @trusted 2839 { 2840 version(LDC) 2841 { 2842 // x86: plain version generates unpckhps with LDC 1.0.0 -O1, but shufflevector 8 less instructions in -O0 2843 return shufflevectorLDC!(__m128, 0, 4, 1, 5)(a, b); 2844 } 2845 else 2846 { 2847 __m128 r; 2848 r.ptr[0] = a.array[0]; 2849 r.ptr[1] = b.array[0]; 2850 r.ptr[2] = a.array[1]; 2851 r.ptr[3] = b.array[1]; 2852 return r; 2853 } 2854 } 2855 unittest 2856 { 2857 __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); 2858 __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); 2859 __m128 R = _mm_unpacklo_ps(A, B); 2860 float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f]; 2861 assert(R.array == correct); 2862 } 2863 2864 /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`. 2865 __m128 _mm_xor_ps (__m128 a, __m128 b) pure @safe 2866 { 2867 return cast(__m128)(cast(__m128i)a ^ cast(__m128i)b); 2868 } 2869 2870 2871 private 2872 { 2873 // Returns: `true` if the pointer is suitably aligned. 2874 bool isPointerAligned(void* p, size_t alignment) pure 2875 { 2876 assert(alignment != 0); 2877 return ( cast(size_t)p & (alignment - 1) ) == 0; 2878 } 2879 2880 // Returns: next pointer aligned with alignment bytes. 2881 void* nextAlignedPointer(void* start, size_t alignment) pure 2882 { 2883 return cast(void*)nextMultipleOf(cast(size_t)(start), alignment); 2884 } 2885 2886 // Returns number of bytes to actually allocate when asking 2887 // for a particular alignment 2888 @nogc size_t requestedSize(size_t askedSize, size_t alignment) pure 2889 { 2890 enum size_t pointerSize = size_t.sizeof; 2891 return askedSize + alignment - 1 + pointerSize * 3; 2892 } 2893 2894 // Store pointer given my malloc, size and alignment 2895 @nogc void* storeRawPointerPlusInfo(void* raw, size_t size, size_t alignment) pure 2896 { 2897 enum size_t pointerSize = size_t.sizeof; 2898 char* start = cast(char*)raw + pointerSize * 3; 2899 void* aligned = nextAlignedPointer(start, alignment); 2900 void** rawLocation = cast(void**)(cast(char*)aligned - pointerSize); 2901 *rawLocation = raw; 2902 size_t* sizeLocation = cast(size_t*)(cast(char*)aligned - 2 * pointerSize); 2903 *sizeLocation = size; 2904 size_t* alignmentLocation = cast(size_t*)(cast(char*)aligned - 3 * pointerSize); 2905 *alignmentLocation = alignment; 2906 assert( isPointerAligned(aligned, alignment) ); 2907 return aligned; 2908 } 2909 2910 // Returns: x, multiple of powerOfTwo, so that x >= n. 
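// For instance (illustration): nextMultipleOf(23, 16) == 32, and an already-aligned
// value is unchanged: nextMultipleOf(32, 16) == 32.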
2911 @nogc size_t nextMultipleOf(size_t n, size_t powerOfTwo) pure nothrow 2912 { 2913 // check power-of-two 2914 assert( (powerOfTwo != 0) && ((powerOfTwo & (powerOfTwo - 1)) == 0)); 2915 2916 size_t mask = ~(powerOfTwo - 1); 2917 return (n + powerOfTwo - 1) & mask; 2918 } 2919 } 2920 2921 unittest 2922 { 2923 assert(nextMultipleOf(0, 4) == 0); 2924 assert(nextMultipleOf(1, 4) == 4); 2925 assert(nextMultipleOf(2, 4) == 4); 2926 assert(nextMultipleOf(3, 4) == 4); 2927 assert(nextMultipleOf(4, 4) == 4); 2928 assert(nextMultipleOf(5, 4) == 8); 2929 2930 { 2931 void* p = _mm_malloc(23, 16); 2932 assert(p !is null); 2933 assert(((cast(size_t)p) & 0xf) == 0); 2934 _mm_free(p); 2935 } 2936 2937 void* nullAlloc = _mm_malloc(0, 32); 2938 assert(nullAlloc != null); 2939 _mm_free(nullAlloc); 2940 } 2941 2942 // For some reason, order of declaration is important for this one 2943 // so it is misplaced. 2944 // Note: is just another name for _mm_cvtss_si32 2945 alias _mm_cvt_ss2si = _mm_cvtss_si32;
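
// Additional illustrative checks (an added sketch, not exhaustive coverage): exercise a few
// intrinsics above that had no dedicated unittest (_mm_shuffle_ps, _mm_xor_ps, _mm_storeu_ps).
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);

    // _mm_shuffle_ps: lanes 0-1 are selected from `A`, lanes 2-3 from `B`.
    __m128 S = _mm_shuffle_ps!(cast(ubyte) _MM_SHUFFLE(2, 3, 0, 1))(A, B);
    float[4] correctS = [2.0f, 1.0f, 8.0f, 7.0f];
    assert(S.array == correctS);

    // XOR of a value with itself gives an all-zero bit pattern.
    __m128 Z = _mm_xor_ps(A, A);
    float[4] zero = [0.0f, 0.0f, 0.0f, 0.0f];
    assert(Z.array == zero);

    // Unaligned store of 4 floats.
    float[4] buf;
    _mm_storeu_ps(buf.ptr, A);
    float[4] correctBuf = [1.0f, 2.0f, 3.0f, 4.0f];
    assert(buf == correctBuf);
}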