/**
* SSE2 intrinsics. 
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1
import inteli.mmx;
import inteli.internals;

nothrow @nogc:


// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

/// Element-wise sum of the eight 16-bit integer lanes of `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    short8 lhs = cast(short8)a;
    short8 rhs = cast(short8)b;
    return cast(__m128i)(lhs + rhs);
}
unittest
{
    __m128i input = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 doubled = cast(short8) _mm_add_epi16(input, input);
    short[8] expected = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(doubled.array == expected);
}

/// Element-wise sum of the four 32-bit integer lanes of `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    int4 lhs = cast(int4)a;
    int4 rhs = cast(int4)b;
    return cast(__m128i)(lhs + rhs);
}
unittest
{
    __m128i input = _mm_setr_epi32( -7, -1, 0, 9);
    int4 doubled = _mm_add_epi32(input, input);
    int[4] expected = [ -14, -2, 0, 18 ];
    assert(doubled.array == expected);
}

/// Element-wise sum of the two 64-bit integer lanes of `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    long2 lhs = cast(long2)a;
    long2 rhs = cast(long2)b;
    return cast(__m128i)(lhs + rhs);
}
unittest
{
    // The second lane checks that the addition wraps around on overflow.
    __m128i input = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 doubled = cast(long2) _mm_add_epi64(input, input);
    long[2] expected = [ -2, 0 ];
    assert(doubled.array == expected);
}

/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    byte16 lhs = cast(byte16)a;
    byte16 rhs = cast(byte16)b;
    return cast(__m128i)(lhs + rhs);
}
unittest
{
    // 77 + 77 and 78 + 78 do not fit in a byte: the expected values show the wrap-around.
    __m128i input = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 doubled = cast(byte16) _mm_add_epi8(input, input);
    byte[16] expected = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(doubled.array == expected);
}

/// Sum the low double-precision (64-bit) lanes of `a` and `b` into the
/// low lane of the result; the high lane of the result is the high
/// lane of `a`.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128d) __simd(XMM.ADDSD, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        // Only lane 0 of the by-value copy of `a` is touched.
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d v = [1.5, -2.0];
    v = _mm_add_sd(v, v);
    assert(v.array == [3.0, -2.0]);
}

/// Element-wise sum of the two double-precision (64-bit) lanes of `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    __m128d sum = a + b;
    return sum;
}
unittest
{
    __m128d v = [1.5, -2.0];
    v = _mm_add_pd(v, v);
    assert(v.array == [3.0, -4.0]);
}

/// Sum of the 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    // PERF DMD
    pragma(inline, true);
    __m64 sum = a + b;
    return sum;
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDSW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSW since LDC 1.15 -O0
            // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20            
            enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
        {
            // Scalar path: saturate each widened sum, then reload the eight lanes.
            short[8] lanes; // PERF =void;
            short8 va = cast(short8)a;
            short8 vb = cast(short8)b;
            foreach(lane; 0..8)
            {
                lanes[lane] = saturateSignedIntToSignedShort(va.array[lane] + vb.array[lane]);
            }
            return _mm_loadu_si128(cast(int4*)lanes.ptr);
        }
        else
            return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Scalar path: saturate each widened sum, then reload the eight lanes.
        short[8] lanes; // PERF =void;
        short8 va = cast(short8)a;
        short8 vb = cast(short8)b;
        foreach(lane; 0..8)
        {
            lanes[lane] = saturateSignedIntToSignedShort(va.array[lane] + vb.array[lane]);
        }
        return _mm_loadu_si128(cast(int4*)lanes.ptr);
    }
}
unittest
{
    __m128i ramp = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 sum = cast(short8) _mm_adds_epi16(ramp, ramp);
    static immutable short[8] expected = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(sum.array == expected);
}

/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDSB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSB since LDC 1.15 -O0
            // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            // Scalar path: saturate each widened sum, then reload the sixteen lanes.
            byte[16] lanes; // PERF =void;
            byte16 va = cast(byte16)a;
            byte16 vb = cast(byte16)b;
            foreach(lane; 0..16)
            {
                lanes[lane] = saturateSignedWordToSignedByte(va[lane] + vb[lane]);
            }
            return _mm_loadu_si128(cast(int4*)lanes.ptr);
        }
        else
            return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Scalar path: saturate each widened sum, then reload the sixteen lanes.
        byte[16] lanes; // PERF =void;
        byte16 va = cast(byte16)a;
        byte16 vb = cast(byte16)b;
        foreach(lane; 0..16)
        {
            lanes[lane] = saturateSignedWordToSignedByte(va[lane] + vb[lane]);
        }
        return _mm_loadu_si128(cast(int4*)lanes.ptr);
    }
}
unittest
{
    __m128i ramp = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    byte16 sum = cast(byte16) _mm_adds_epi8(ramp, ramp);
    static immutable byte[16] expected = [0, 2, 4, 6, 8, 10, 12, 14,
                                          16, 18, 20, 22, 24, 26, 28, 30];
    assert(sum.array == expected);
}

/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PADDUSB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddusb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSB since LDC 1.15 -O0
            // ARM: Generates uqadd.16b since LDC 1.21 -O1
            enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            // Scalar path: lanes are reinterpreted as unsigned before being summed and saturated.
            ubyte[16] lanes; // PERF =void;
            byte16 va = cast(byte16)a;
            byte16 vb = cast(byte16)b;
            foreach(lane; 0..16)
            {
                lanes[lane] = saturateSignedWordToUnsignedByte(cast(ubyte)(va.array[lane]) + cast(ubyte)(vb.array[lane]));
            }
            return _mm_loadu_si128(cast(int4*)lanes.ptr);
        }
        else
            return __builtin_ia32_paddusb128(a, b);
    }
    else
    {
        // Scalar path: lanes are reinterpreted as unsigned before being summed and saturated.
        ubyte[16] lanes; // PERF =void;
        byte16 va = cast(byte16)a;
        byte16 vb = cast(byte16)b;
        foreach(lane; 0..16)
        {
            lanes[lane] = saturateSignedWordToUnsignedByte(cast(ubyte)(va.array[lane]) + cast(ubyte)(vb.array[lane]));
        }
        return _mm_loadu_si128(cast(int4*)lanes.ptr);
    }
}
unittest
{
    // 255 + 1 must clamp to 255 instead of wrapping to 0.
    __m128i lhs = _mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0);
    __m128i rhs = _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0);
    byte16 sum = cast(byte16) _mm_adds_epu8(lhs, rhs);
    static immutable byte[16] expected = [0, cast(byte)255, 4, 6, 8, 10, 12, 14,
                                          0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(sum.array == expected);
}

/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        // Note: DMD generates a reverted paddusw vs LDC and GDC, but that doesn't change the result anyway
        return cast(__m128i) __simd(XMM.PADDUSW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddusw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSW since LDC 1.15 -O0
            // ARM: Generates uqadd.8h since LDC 1.21 -O1
            enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 
        {
            // Scalar path: lanes are reinterpreted as unsigned before being summed and saturated.
            ushort[8] lanes; // PERF =void;
            short8 va = cast(short8)a;
            short8 vb = cast(short8)b;
            foreach(lane; 0..8)
            {
                lanes[lane] = saturateSignedIntToUnsignedShort(cast(ushort)(va.array[lane]) + cast(ushort)(vb.array[lane]));
            }
            return _mm_loadu_si128(cast(int4*)lanes.ptr);
        }
        else
            return __builtin_ia32_paddusw128(a, b);
    }
    else
    {
        // Scalar path: lanes are reinterpreted as unsigned before being summed and saturated.
        ushort[8] lanes; // PERF =void;
        short8 va = cast(short8)a;
        short8 vb = cast(short8)b;
        foreach(lane; 0..8)
        {
            lanes[lane] = saturateSignedIntToUnsignedShort(cast(ushort)(va.array[lane]) + cast(ushort)(vb.array[lane]));
        }
        return _mm_loadu_si128(cast(int4*)lanes.ptr);
    }
}
unittest
{
    // 65535 + 1 must clamp to 65535 instead of wrapping to 0.
    __m128i lhs = _mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0);
    __m128i rhs = _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0);
    short8 sum = cast(short8) _mm_adds_epu16(lhs, rhs);
    static immutable short[8] expected = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(sum.array == expected);
}

/// Bitwise AND of the 128 bits of `a` and `b`, both holding packed
/// double-precision (64-bit) floating-point elements.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    long2 bitsA = cast(long2)a;
    long2 bitsB = cast(long2)b;
    return cast(__m128d)( bitsA & bitsB );
}
unittest
{
    double x = 4.32;
    double y = -78.99;
    long expectedBits = (*cast(long*)(&x)) & (*cast(long*)(&y));
    __m128d X = _mm_set_pd(x, y);
    __m128d Y = _mm_set_pd(y, x);
    long2 masked = cast(long2)( _mm_and_pd(X, Y) );
    assert(masked.array[0] == expectedBits);
    assert(masked.array[1] == expectedBits);
}

/// Bitwise AND of the 128 bits (representing integer data) of `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    __m128i masked = a & b;
    return masked;
}
unittest
{
    __m128i sevens = _mm_set1_epi32(7);
    __m128i fourteens = _mm_set1_epi32(14);
    __m128i masked = _mm_and_si128(sevens, fourteens);
    int[4] expected = [6, 6, 6, 6];
    assert(masked.array == expected);
}

/// Bitwise `(NOT a) AND b` over packed double-precision (64-bit)
/// floating-point elements.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128d) __simd(XMM.ANDNPD, a, b);
    }
    else
    {
        long2 invertedA = ~(cast(long2)a);
        return cast(__m128d)( invertedA & cast(long2)b );
    }
}
unittest
{
    double x = 4.32;
    double y = -78.99;
    long expectedLow  = (~*cast(long*)(&x)) & ( *cast(long*)(&y));
    long expectedHigh = ( *cast(long*)(&x)) & (~*cast(long*)(&y));
    __m128d X = _mm_setr_pd(x, y);
    __m128d Y = _mm_setr_pd(y, x);
    long2 masked = cast(long2)( _mm_andnot_pd(X, Y) );
    assert(masked.array[0] == expectedLow);
    assert(masked.array[1] == expectedHigh);
}

/// Compute the bitwise NOT of 128 bits (representing integer data) 
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PANDN, a, b);
    }
    else
    {
        __m128i invertedA = ~a;
        return invertedA & b;
    }
}
unittest
{
    __m128i toInvert = _mm_setr_epi32(7, -2, 9, 54654);
    __m128i mask = _mm_setr_epi32(14, 78, 111, -256);
    __m128i result = _mm_andnot_si128(toInvert, mask);
    int[4] expected = [8, 0, 102, -54784];
    assert(result.array == expected);
}

/// Rounded average `(a + b + 1) >> 1` of packed unsigned 16-bit integers in `a` and `b`.
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PAVGW, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        // Scalar path: every lane is written before the vector is read back.
        short8 va = cast(short8)a;
        short8 vb = cast(short8)b;
        short8 avg = void;
        foreach(lane; 0..8)
        {
            avg.ptr[lane] = cast(ushort)( (cast(ushort)(va.array[lane]) + cast(ushort)(vb.array[lane]) + 1) >> 1 );
        }
        return cast(int4)avg;
    }
}
unittest
{
    // (31 + 64 + 1) >> 1 == 48
    __m128i low = _mm_set1_epi16(31);
    __m128i high = _mm_set1_epi16(64);
    short8 mean = cast(short8)(_mm_avg_epu16(low, high));
    foreach(lane; 0..8)
        assert(mean.array[lane] == 48);
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.PAVGB, a, b);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
    }
    else version(LDC)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Scalar path: every lane is written before the vector is read back.
        byte16 va = cast(byte16)a;
        byte16 vb = cast(byte16)b;
        byte16 avg = void;
        foreach(lane; 0..16)
        {
            avg[lane] = cast(ubyte)( (cast(ubyte)(va[lane]) + cast(ubyte)(vb[lane]) + 1) >> 1 );
        }
        return cast(int4)avg;
    }
}
unittest
{
    // (31 + 64 + 1) >> 1 == 48
    __m128i low = _mm_set1_epi8(31);
    __m128i high = _mm_set1_epi8(64);
    byte16 mean = cast(byte16)(_mm_avg_epu8(low, high));
    foreach(lane; 0..16)
        assert(mean.array[lane] == 48);
}

/// Shift `a` left by `bytes` bytes while shifting in zeros.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i ramp = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] expected = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    __m128i shifted = _mm_bslli_si128!5(ramp);
    assert( (cast(byte16)shifted).array == expected);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i ramp = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] expected = [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0];
    __m128i shifted = _mm_bsrli_si128!5(ramp);
    assert( (cast(byte16)shifted).array == expected);
}

/// Reinterpret a `__m128d` vector as `__m128`.
/// Note: equivalent to writing `cast(__m128)(a)` directly.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    __m128 reinterpreted = cast(__m128)a;
    return reinterpreted;
}

/// Reinterpret a `__m128d` vector as `__m128i`.
/// Note: equivalent to writing `cast(__m128i)(a)` directly.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    __m128i reinterpreted = cast(__m128i)a;
    return reinterpreted;
}

/// Reinterpret a `__m128` vector as `__m128d`.
/// Note: equivalent to writing `cast(__m128d)(a)` directly.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    __m128d reinterpreted = cast(__m128d)a;
    return reinterpreted;
}

/// Reinterpret a `__m128` vector as `__m128i`.
/// Note: equivalent to writing `cast(__m128i)(a)` directly.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    __m128i reinterpreted = cast(__m128i)a;
    return reinterpreted;
}

/// Reinterpret a `__m128i` vector as `__m128d`.
/// Note: equivalent to writing `cast(__m128d)(a)` directly.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    __m128d reinterpreted = cast(__m128d)a;
    return reinterpreted;
}

/// Cast vector of type `__m128i` to type `__m128`. 
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    __m128 reinterpreted = cast(__m128)a;
    return reinterpreted;
}

/// Flush and invalidate, at every level of the cache hierarchy,
/// the cache line containing the address `p`.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        // This branch casts away constness before calling the builtin.
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else
    {
        // Do nothing. Invalidating cacheline does
        // not affect correctness.
    }
}
unittest
{
    ubyte[64] line;
    _mm_clflush(line.ptr);
}

/// Lane-wise equality test of packed 16-bit integers in `a` and `b`.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
    else
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
}
unittest
{
    short8 lhs = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8 rhs = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] expected = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8 mask = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)lhs, cast(__m128i)rhs));
    assert(mask.array == expected);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_pcmpeqd128(a, b);
    else
        return equalMask!__m128i(a, b);
}
unittest
{
    int4 lhs = [-3, -2, -1,  0];
    int4 rhs = [ 4, -2,  2,  0];
    int[4] expected = [ 0, -1,  0, -1];
    int4 mask = cast(int4)(_mm_cmpeq_epi32(lhs, rhs));
    assert(mask.array == expected);
}

/// Lane-wise equality test of packed 8-bit integers in `a` and `b`.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
        return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
    else
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
}
unittest
{
    __m128i lhs = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i rhs = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 mask = cast(byte16) _mm_cmpeq_epi8(lhs, rhs);
    byte[16] expected =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(mask.array == expected);
}

/// Lane-wise equality test of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpeqpd(a, b);
    else
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
}

/// Equality test of the lower double-precision (64-bit) floating-point
/// elements of `a` and `b`; the result goes in the lower element and
/// the upper element is copied from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpeqsd(a, b);
    else
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
}

/// Compare packed 16-bit integers elements in `a` and `b` for greater-than-or-equal.
/// #BONUS
__m128i _mm_cmpge_epi16 (__m128i a, __m128i b) pure @safe
{
    version (LDC)
    {
        // LDC ARM64: generates cmge since -O1
        return cast(__m128i) greaterOrEqualMask!short8(cast(short8)a, cast(short8)b);
    }
    else
    {
        // A lane cannot be both equal and greater, so the two masks never
        // overlap and XOR combines them exactly like OR would.
        __m128i equal   = _mm_cmpeq_epi16(a, b);
        __m128i greater = _mm_cmpgt_epi16(a, b);
        return _mm_xor_si128(equal, greater);
    }
}
unittest
{
    short8   A = [-3, -2, -32768,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  32767,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,      0,  0,  -1, -1, -1, -1];
    short8   R = cast(short8)(_mm_cmpge_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than-or-equal.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgesd builtin.
    static if (GDC_with_SSE2)
    {
        // `a >= b` is computed as `b <= a`. Because the operands are swapped,
        // the compare result carries the upper lane of `b`; movsd splices the
        // low (mask) lane back onto `a` so the upper lane comes from `a`.
        // This mirrors the formulation used by GCC's own emmintrin.h.
        return __builtin_ia32_movsd(a, __builtin_ia32_cmplesd(b, a));
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(2.0, 4.0);
    __m128d N = _mm_setr_pd(double.nan, 5.0);

    long2 smaller = cast(long2) _mm_cmpge_sd(A, B); // 1.0 >= 2.0
    long2 larger  = cast(long2) _mm_cmpge_sd(B, A); // 2.0 >= 1.0
    long2 same    = cast(long2) _mm_cmpge_sd(A, A); // 1.0 >= 1.0
    long2 withNaN = cast(long2) _mm_cmpge_sd(A, N); // 1.0 >= NaN
    assert(smaller.array[0] == 0);
    assert(larger.array[0]  == -1);
    assert(same.array[0]    == -1);
    assert(withNaN.array[0] == 0);

    // The upper lane is taken from the first operand.
    __m128d R = _mm_cmpge_sd(A, B);
    assert(R.array[1] == 3.0);
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b); 
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b));
    }
}
unittest
{
    int4   A = [-3,  2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0,  0];
    int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    // Workaround of a GCC bug here.
    // Of course the GCC builtin is buggy and generates a weird (and wrong) sequence
    // with __builtin_ia32_pcmpgtb128.
    // GCC's emmintrin.h uses comparison operators we don't have instead.
    // PERF: this is a quite severe GDC performance problem.
    // Could be worked around with inline assembly, or another algorithm I guess.

    /*
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else */
    {
        return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1,  127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct =       [0, 0,-1, 0,   -1,   0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b); 
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        // `a > b` is computed as `b < a`. Because the operands are swapped,
        // the compare result carries the upper lane of `b`; movsd splices the
        // low (mask) lane back onto `a` so the upper lane comes from `a`.
        // This mirrors the formulation used by GCC's own emmintrin.h.
        return __builtin_ia32_movsd(a, __builtin_ia32_cmpltsd(b, a));
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(2.0, 4.0);
    __m128d N = _mm_setr_pd(double.nan, 5.0);

    long2 smaller = cast(long2) _mm_cmpgt_sd(A, B); // 1.0 > 2.0
    long2 larger  = cast(long2) _mm_cmpgt_sd(B, A); // 2.0 > 1.0
    long2 same    = cast(long2) _mm_cmpgt_sd(A, A); // 1.0 > 1.0
    long2 withNaN = cast(long2) _mm_cmpgt_sd(A, N); // 1.0 > NaN
    assert(smaller.array[0] == 0);
    assert(larger.array[0]  == -1);
    assert(same.array[0]    == 0);
    assert(withNaN.array[0] == 0);

    // The upper lane is taken from the first operand.
    __m128d R = _mm_cmpgt_sd(A, B);
    assert(R.array[1] == 3.0);
}

/// Compare packed 16-bit integers elements in `a` and `b` for less-than-or-equal.
/// #BONUS
__m128i _mm_cmple_epi16 (__m128i a, __m128i b) pure @safe
{
    // `a <= b` is evaluated as `b >= a`: same code as _mm_cmpge_epi16 with operands swapped.
    version (LDC)
    {
        // LDC ARM64: generates cmge since -O1
        short8 lhs = cast(short8)b;
        short8 rhs = cast(short8)a;
        return cast(__m128i) greaterOrEqualMask!short8(lhs, rhs);
    }
    else
    {
        __m128i equal   = _mm_cmpeq_epi16(b, a);
        __m128i greater = _mm_cmpgt_epi16(b, a);
        return _mm_xor_si128(equal, greater);
    }
}
unittest
{
    short8 lhs = [-3, -2, -32768,  1,  0,  1,  2,  3];
    short8 rhs = [ 4,  3,  32767,  0,  0, -1, -2, -3];
    short[8] expected = [-1, -1,  -1,  0,  -1, 0, 0, 0];
    short8 mask = cast(short8)(_mm_cmple_epi16(cast(__m128i)lhs, cast(__m128i)rhs));
    assert(mask.array == expected);
}

/// Lane-wise less-than-or-equal test of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmplepd(a, b);
    else
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
}

/// Less-than-or-equal test of the lower double-precision (64-bit)
/// floating-point elements of `a` and `b`; the result goes in the lower
/// element and the upper element is copied from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmplesd(a, b);
    else
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
}

/// Lane-wise less-than test of packed 16-bit integers in `a` and `b`.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    // a < b is the same as b > a.
    __m128i mask = _mm_cmpgt_epi16(b, a);
    return mask;
}

/// Lane-wise less-than test of packed 32-bit integers in `a` and `b`.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    // a < b is the same as b > a.
    __m128i mask = _mm_cmpgt_epi32(b, a);
    return mask;
}

/// Compare packed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    // a < b is the same as b > a.
    __m128i mask = _mm_cmpgt_epi8(b, a);
    return mask;
}

/// Lane-wise less-than test of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpltpd(a, b);
    else
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
}

/// Less-than test of the lower double-precision (64-bit) floating-point
/// elements of `a` and `b`; the result goes in the lower element and
/// the upper element is copied from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpltsd(a, b);
    else
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
}

/// Lane-wise not-equal test of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpneqpd(a, b);
    else
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
}

/// Not-equal test of the lower double-precision (64-bit) floating-point
/// elements of `a` and `b`; the result goes in the lower element and
/// the upper element is copied from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpneqsd(a, b);
    else
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than-or-equal.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b); 
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than-or-equal, store the result in 
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        // `!(a >= b)` is computed as `!(b <= a)`. Because the operands are
        // swapped, the compare result carries the upper lane of `b`; movsd
        // splices the low (mask) lane back onto `a` so the upper lane comes
        // from `a`. This mirrors the formulation used by GCC's emmintrin.h.
        return __builtin_ia32_movsd(a, __builtin_ia32_cmpnlesd(b, a));
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(2.0, 4.0);
    __m128d N = _mm_setr_pd(double.nan, 5.0);

    long2 smaller = cast(long2) _mm_cmpnge_sd(A, B); // !(1.0 >= 2.0)
    long2 larger  = cast(long2) _mm_cmpnge_sd(B, A); // !(2.0 >= 1.0)
    long2 same    = cast(long2) _mm_cmpnge_sd(A, A); // !(1.0 >= 1.0)
    long2 withNaN = cast(long2) _mm_cmpnge_sd(A, N); // !(1.0 >= NaN)
    assert(smaller.array[0] == -1);
    assert(larger.array[0]  == 0);
    assert(same.array[0]    == 0);
    assert(withNaN.array[0] == -1);

    // The upper lane is taken from the first operand.
    __m128d R = _mm_cmpnge_sd(A, B);
    assert(R.array[1] == 3.0);
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-greater-than, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        // `!(a > b)` is computed as `!(b < a)`. Because the operands are
        // swapped, the compare result carries the upper lane of `b`; movsd
        // splices the low (mask) lane back onto `a` so the upper lane comes
        // from `a`. This mirrors the formulation used by GCC's emmintrin.h.
        return __builtin_ia32_movsd(a, __builtin_ia32_cmpnltsd(b, a));
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(2.0, 4.0);
    __m128d N = _mm_setr_pd(double.nan, 5.0);

    long2 smaller = cast(long2) _mm_cmpngt_sd(A, B); // !(1.0 > 2.0)
    long2 larger  = cast(long2) _mm_cmpngt_sd(B, A); // !(2.0 > 1.0)
    long2 same    = cast(long2) _mm_cmpngt_sd(A, A); // !(1.0 > 1.0)
    long2 withNaN = cast(long2) _mm_cmpngt_sd(A, N); // !(1.0 > NaN)
    assert(smaller.array[0] == -1);
    assert(larger.array[0]  == 0);
    assert(same.array[0]    == -1);
    assert(withNaN.array[0] == -1);

    // The upper lane is taken from the first operand.
    __m128d R = _mm_cmpngt_sd(A, B);
    assert(R.array[1] == 3.0);
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than-or-equal.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    // GDC has a dedicated builtin; other compilers use the generic `cmppd` helper.
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpnlepd(a, b);
    else
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than-or-equal, store the result in the 
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    // GDC has a dedicated builtin; other compilers use the generic `cmpsd` helper.
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpnlesd(a, b);
    else
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    // GDC has a dedicated builtin; other compilers use the generic `cmppd` helper.
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpnltpd(a, b);
    else
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` for not-less-than, store the result in the lower 
/// element, and copy the upper element from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    // GDC has a dedicated builtin; other compilers use the generic `cmpsd` helper.
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpnltsd(a, b);
    else
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    // GDC has a dedicated builtin; other compilers use the generic `cmppd` helper.
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpordpd(a, b);
    else
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if neither is NaN, store the result in the 
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    // GDC has a dedicated builtin; other compilers use the generic `cmpsd` helper.
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpordsd(a, b);
    else
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
}

/// Compare packed double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    // GDC has a dedicated builtin; other compilers use the generic `cmppd` helper.
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpunordpd(a, b);
    else
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
}

/// Compare the lower double-precision (64-bit) floating-point elements 
/// in `a` and `b` to see if either is NaN, store the result in the lower 
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    // GDC has a dedicated builtin; other compilers use the generic `cmpsd` helper.
    static if (GDC_with_SSE2)
        return __builtin_ia32_cmpunordsd(a, b);
    else
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
}

/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    // Note: For some of the _mm_comixx_sx intrinsics, NaN semantics of the intrinsic
    // are not the same as the comisd instruction: the intrinsic returns false in case
    // of unordered operands instead.
    //
    // Actually C++ compilers disagree over the meaning of that instruction.
    // GCC will manage NaNs like the comisd instruction (return true if unordered),
    // but ICC, clang and MSVC will deal with NaN like the Intel Intrinsics Guide says.
    // We choose to do like the most numerous. It seems GCC is buggy with NaNs.
    const double lhs = a.array[0];
    const double rhs = b.array[0];
    return lhs == rhs;
}
unittest
{
    assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for greater-than-or-equal, and return the boolean 
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    // Plain scalar comparison of the two lower lanes.
    const double lhs = a.array[0];
    const double rhs = b.array[0];
    return lhs >= rhs;
}
unittest
{
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    // Plain scalar comparison of the two lower lanes.
    const double lhs = a.array[0];
    const double rhs = b.array[0];
    return lhs > rhs;
}
unittest
{
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than-or-equal, and return the boolean 
/// result (0 or 1).
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    // Plain scalar comparison of the two lower lanes.
    const double lhs = a.array[0];
    const double rhs = b.array[0];
    return lhs <= rhs;
}
unittest
{
    assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element 
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    // Plain scalar comparison of the two lower lanes.
    const double lhs = a.array[0];
    const double rhs = b.array[0];
    return lhs < rhs;
}
unittest
{
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    // Plain scalar comparison of the two lower lanes.
    const double lhs = a.array[0];
    const double rhs = b.array[0];
    return lhs != rhs;
}
unittest
{
    assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        // Scalar fallback: only the two lowest integers are converted.
        double2 widened = void;
        widened.ptr[0] = a.array[0];
        widened.ptr[1] = a.array[1];
        return widened;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 
/// floating-point elements.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128)__simd(XMM.CVTDQ2PS, cast(void16) a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else version(LDC)
    {
        // See #86 for why we had to resort to LLVM IR.
        // Plain code below was leading to catastrophic behaviour. 
        // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x float>
            ret <4 x float> %r`;
        return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
    }
    else
    {
        // Scalar fallback: convert each of the four lanes in turn.
        __m128 converted; // PERF =void;
        converted.ptr[0] = cast(float)a.array[0];
        converted.ptr[1] = cast(float)a.array[1];
        converted.ptr[2] = cast(float)a.array[2];
        converted.ptr[3] = cast(float)a.array[3];
        return converted;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

/// Convert packed double-precision (64-bit) floating-point elements 
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    // PERF ARM32
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        // Both compilers expose the same builtin name.
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Pick the conversion matching the rounding mode currently set in FPCR.
        uint fpcr = arm_get_fpcr();
        long2 wide;
        switch(fpcr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     wide = vcvtnq_s64_f64(a); break;
            case _MM_ROUND_DOWN_ARM:        wide = vcvtmq_s64_f64(a); break;
            case _MM_ROUND_UP_ARM:          wide = vcvtpq_s64_f64(a); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: wide = vcvtzq_s64_f64(a); break;
        }
        // Keep 32-bit lanes 0 and 2 of the 64-bit results, and fill the upper
        // half of the output from the zero vector.
        int4 zeroes = 0;
        return cast(__m128i) shufflevectorLDC!(int4, 0, 2, 4, 6)(cast(int4)wide, zeroes);
    }
    else
    {
        // PERF ARM32
        __m128i packed = _mm_setzero_si128();
        packed.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        packed.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return packed;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    // Convert with the 128-bit intrinsic, then narrow the result to 64 bits.
    __m128i wide = _mm_cvtpd_epi32(v);
    return to_m64(wide);
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

/// Convert packed double-precision (64-bit) floating-point elements 
/// in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        // Same builtin name on both compilers.
        // On LDC this can't be done with IR unfortunately.
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    {
        // Scalar fallback: narrow the two doubles, zero the two upper lanes.
        __m128 narrowed = void;
        narrowed.ptr[0] = a.array[0];
        narrowed.ptr[1] = a.array[1];
        narrowed.ptr[2] = 0;
        narrowed.ptr[3] = 0;
        return narrowed;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

/// Convert packed 32-bit integers in `v` to packed double-precision 
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    // Widen to a 128-bit integer vector, then reuse the 128-bit conversion.
    __m128i wide = to_m128i(v);
    return _mm_cvtepi32_pd(wide);
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}

/// Convert packed single-precision (32-bit) floating-point elements 
/// in `a` to packed 32-bit integers
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_cvtps2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Pick the conversion matching the rounding mode currently set in FPCR.
        uint fpcr = arm_get_fpcr();
        switch(fpcr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        // Scalar fallback: convert lane by lane using the MXCSR-based helper.
        __m128i rounded = void;
        rounded.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        rounded.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        rounded.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        rounded.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return rounded;
    }
}
unittest
{
    // GDC bug #98607
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide optimization barrier for rounding mode.
    // Worked around with different literals. This bug will likely only manifest in unittest.
    // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed single-precision (32-bit) floating-point elements 
/// in `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        // Scalar fallback: only the two lowest floats are widened.
        double2 widened = void;
        widened.ptr[0] = a.array[0];
        widened.ptr[1] = a.array[1];
        return widened;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    const double lowest = a.array[0];
    return lowest;
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        // Same builtin name on both compilers.
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtsd_si64 (__m128d a) @trusted
{
    version (LDC)
    {
        version (X86_64)
        {
            return __builtin_ia32_cvtsd2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertDoubleToInt64UsingMXCSR(a.array[0]);
        }
    }
    else
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///

/// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) 
/// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b); 
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        // Only lane 0 of `a` is overwritten; lanes 1..3 are returned untouched.
        a.ptr[0] = b.array[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    const int lowest = a.array[0];
    return lowest;
}

/// Get the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    // Reinterpret the four 32-bit lanes as two 64-bit lanes and take the first.
    long2 asLongs = cast(long2)a;
    return asLongs.array[0];
}
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;

/// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the 
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
{
    // `a` is a by-value copy: overwrite its lower lane and hand it back.
    const double converted = cast(double)b;
    a.ptr[0] = converted;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    // Start from an all-zero vector, then set lane 0.
    int4 lanes = [0, 0, 0, 0];
    lanes.ptr[0] = a;
    return lanes;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}

/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in 
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
{
    // `a` is a by-value copy: overwrite its lower lane and hand it back.
    const double converted = cast(double)b;
    a.ptr[0] = converted;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    // Start from an all-zero vector of two 64-bit lanes, then set lane 0.
    long2 lanes = [0, 0];
    lanes.ptr[0] = a;
    return cast(__m128i)(lanes);
}

deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///

/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 
/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 
/// element of result.
double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
{
    // `a` is a by-value copy: overwrite its lower lane with the widened float.
    const float lowest = b.array[0];
    a.ptr[0] = lowest;
    return a;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    const float lowest = a.array[0];
    return cast(long)(lowest); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
/// Put zeroes in the upper elements of result.
__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2 || GDC_with_SSE2)
    {
        // Same builtin name on both compilers.
        return __builtin_ia32_cvttpd2dq(a);
    }
    else
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        // Truncate the two doubles with plain casts and zero the two upper lanes.
        __m128i truncated; // PERF =void;
        truncated.ptr[0] = cast(int)a.array[0];
        truncated.ptr[1] = cast(int)a.array[1];
        truncated.ptr[2] = 0;
        truncated.ptr[3] = 0;
        return truncated;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v` 
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    // Convert with the 128-bit intrinsic, then narrow the result to 64 bits.
    __m128i wide = _mm_cvttpd_epi32(v);
    return to_m64(wide);
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: generates fcvtze since LDC 1.8 -O2
    __m128i truncated; // PERF = void;
    truncated.ptr[0] = cast(int)a.array[0];
    truncated.ptr[1] = cast(int)a.array[1];
    truncated.ptr[2] = cast(int)a.array[2];
    truncated.ptr[3] = cast(int)a.array[3];
    return truncated;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a) pure @safe
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // A plain cast only reads lane 0, so the function can be `pure @safe`
    // like the sibling `_mm_cvttss_si64`.
    return cast(int)a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a) pure @safe
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resorts to the FPU
    return cast(long)a.array[0];
}

deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///

/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a / b;
}

/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower 
/// element in `b`, store the result in the lower element of result, and copy the upper 
/// element from `a` to the upper element of result.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_divsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        // `a` is a by-value copy: only its lower lane is divided.
        a.ptr[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    // `index & 7` keeps the lane in range; going through `ushort` zero-extends.
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
    assert(_mm_extract_epi16(A, 5 + 8) == 5);
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
/// Only the lowest 3 bits of `index` are used, and `i` is truncated to 16 bits.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) pure @trusted
{
    // Marked `pure` like `_mm_extract_epi16`: the body only writes one lane of a local copy.
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

/// Perform a serializing operation on all load-from-memory instructions that were issued prior 
/// to this instruction. Guarantees that every load instruction that precedes, in program order, 
/// is globally visible before any load instruction which follows the fence in program order.
void _mm_lfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_lfence();
    }
    else static if (LDC_with_ARM64)
    {
         __builtin_arm_dmb(9);  // dmb ishld
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
    else version(LDC)
    {
        // When the architecture is unknown, generate a full memory barrier,
        // as the semantics of lfence do not really match those of atomics.
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_lfence();
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    pragma(inline, true);
    // Reinterpret the address as a vector pointer and read the vector in one go.
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}
unittest
{
    align(16) double[2] S = [-5.0, 7.0];
    __m128d R = _mm_load_pd(S.ptr);
    assert(R.array == S);
}

/// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    // Read the scalar once, then copy it into both lanes.
    double m = *mem_addr;
    __m128d r; // PERF =void;
    r.ptr[0] = m;
    r.ptr[1] = m;
    return r;
}
unittest
{
    double what = 4;
    __m128d R = _mm_load_pd1(&what);
    double[2] correct = [4.0, 4];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper 
/// element. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    // Start from an all-zero vector, then set lane 0.
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Load 128-bits of integer data from memory into dst. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be trusted because alignment, Issue #62
{
    pragma(inline, true);
    // The pointer is already vector-typed, so a direct dereference is enough.
    __m128i loaded = *mem_addr;
    return loaded;
}
unittest
{
    align(16) int[4] correct = [-1, 2, 3, 4];
    int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
    assert(A.array == correct);
}

alias _mm_load1_pd = _mm_load_pd1; ///

/// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 
/// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    pragma(inline, true);
    // `a` is a by-value copy: only its upper lane is replaced.
    const double upper = *mem_addr;
    a.ptr[1] = upper;
    return a;
}
unittest
{
    double A = 7.0;
    __m128d B = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadh_pd(B, &A);
    double[2] correct = [ 4.0, 7.0 ];
    assert(R.array == correct);
}

/// Load 64-bit integer from memory into the first element of result. Zero out the other.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60), and doesn't have to be 128-bit
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
    }
    else
    {
        // Read a single 64-bit value through the pointer and put it in lane 0
        // of an otherwise zeroed vector.
        const(long)* source = cast(const(long)*)mem_addr;
        long2 lanes = [0, 0];
        lanes.ptr[0] = *source;
        return cast(__m128i)(lanes);
    }
}
unittest
{
    long A = 0x7878787870707070;
    long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
    long[2] correct = [0x7878787870707070, 0];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 
/// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    // `a` is a by-value copy: only its lower lane is replaced.
    const double lower = *mem_addr;
    a.ptr[0] = lower;
    return a;
}
unittest
{
    double A = 7.0;
    __m128d B = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadl_pd(B, &A);
    double[2] correct = [ 7.0, -5.0 ];
    assert(R.array == correct);
}

/// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    // Read the whole vector through a vector pointer, then swap its two lanes.
    __m128d loaded = *cast(__m128d*)(mem_addr);
    __m128d swapped; // PERF =void;
    swapped.ptr[0] = loaded.array[1];
    swapped.ptr[1] = loaded.array[0];
    return swapped;
}
unittest
{
    align(16) double[2] A = [56.0, -74.0];
    __m128d R = _mm_loadr_pd(A.ptr);
    double[2] correct = [-74.0, 56.0];
    assert(R.array == correct);
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loadupd(mem_addr); 
    }
    else version(LDC)
    {
        return loadUnaligned!(double2)(mem_addr);
    }
    else version(DigitalMars)
    {
        // Apparently inside __simd you can use aligned dereferences without fear.
        // That was issue 23048 on dlang's Bugzilla.
        static if (DMD_with_DSIMD)
        {
            return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr);
        }
        else static if (SSESizedVectorsAreEmulated)
        {
            // Since this vector is emulated, it doesn't have alignment constraints
            // and as such we can just cast it.
            return *cast(__m128d*)(mem_addr);
        }
        else
        {
            // Element-wise copy, which makes no alignment assumption.
            __m128d loaded;
            loaded.ptr[0] = mem_addr[0];
            loaded.ptr[1] = mem_addr[1];
            return loaded;
        }
    }
    else
    {
        // Element-wise copy, which makes no alignment assumption.
        __m128d loaded;
        loaded.ptr[0] = mem_addr[0];
        loaded.ptr[1] = mem_addr[1];
        return loaded;
    }
}
unittest
{
    double[2] A = [56.0, -75.0];
    __m128d R = _mm_loadu_pd(A.ptr);
    double[2] correct = [56.0, -75.0];
    assert(R.array == correct);
}

/// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    // PERF DMD
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m128i)(cast(int*)mem_addr);
    }
    else
    {
        // Copy the four 32-bit lanes one by one, so no aligned vector load is performed.
        const(int)* p = cast(const(int)*)mem_addr;
        __m128i r = void;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        return r;
    }
}
unittest
{
    align(16) int[4] correct = [-1, 2, -3, 4];
    int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
    assert(A.array == correct);
}

/// Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise.
__m128i _mm_loadu_si16(const(void)* mem_addr) pure @trusted // TODO: should be @system actually
{
    static if (DMD_with_DSIMD)
    {
        // The 16-bit value is widened to a 32-bit temporary that XMM.LODD then consumes.
        // It must be zero-extended: reading it as a signed `short` would sign-extend
        // negative inputs and put 0xFFFF in the second 16-bit element instead of zero.
        int r = *cast(const(ushort)*)(mem_addr);
        return cast(__m128i) __simd(XMM.LODD, *cast(__m128i*)&r);
    }
    else version(DigitalMars)
    {
        // Workaround issue: https://issues.dlang.org/show_bug.cgi?id=21672
        // DMD cannot handle the below code...
        align(16) short[8] r = [0, 0, 0, 0, 0, 0, 0, 0];
        r[0] = *cast(short*)(mem_addr);
        return *cast(int4*)(r.ptr);
    }
    else
    {
        short r = *cast(short*)(mem_addr);
        short8 result = [0, 0, 0, 0, 0, 0, 0, 0];
        result.ptr[0] = r;
        return cast(__m128i)result;
    }
}
unittest
{
    short r = 13;
    short8 A = cast(short8) _mm_loadu_si16(&r);
    short[8] correct = [13, 0, 0, 0, 0, 0, 0, 0];
    assert(A.array == correct);

    // A negative value must not leak its sign bits into the neighbouring element.
    short negative = -2;
    short8 B = cast(short8) _mm_loadu_si16(&negative);
    short[8] correctNegative = [-2, 0, 0, 0, 0, 0, 0, 0];
    assert(B.array == correctNegative);
}

/// Load unaligned 32-bit integer from memory into the first element of result.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted // TODO: should be @system actually
{
    pragma(inline, true);
    // Start from an all-zero vector and fill in the low lane only.
    int4 lanes = [0, 0, 0, 0];
    lanes.ptr[0] = *cast(int*)(mem_addr);
    return lanes;
}
unittest
{
    int source = 42;
    __m128i A = _mm_loadu_si32(&source);
    int[4] expected = [42, 0, 0, 0];
    assert(A.array == expected);
}

/// Load unaligned 64-bit integer from memory into the first element of result.
/// Upper 64-bit is zeroed.
__m128i _mm_loadu_si64 (const(void)* mem_addr) pure @system
{
    pragma(inline, true);
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd(XMM.LODQ, *cast(__m128i*)mem_addr);
    }
    else
    {
        // Only one `long` is read through `mem_addr`; the upper lane stays zero.
        long2 lanes = [0, 0];
        lanes.ptr[0] = *cast(const(long)*)mem_addr;
        return cast(__m128i) lanes;
    }
}
unittest
{
    long source = 446446446446;
    long2 A = cast(long2) _mm_loadu_si64(&source);
    long[2] expected = [446446446446, 0];
    assert(A.array == expected);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
/// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
/// and pack the results in destination.
__m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        // Both compilers expose the builtin under the same name.
        return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Widening multiplies of both halves, then pairwise adds.
        int4 prodLow  = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b));
        int4 prodHigh = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b));
        int2 sumLow  = vpadd_s32(vget_low_s32(prodLow), vget_high_s32(prodLow));
        int2 sumHigh = vpadd_s32(vget_low_s32(prodHigh), vget_high_s32(prodHigh));
        return vcombine_s32(sumLow, sumHigh);
    }
    else
    {
        short8 x = cast(short8)a;
        short8 y = cast(short8)b;
        int4 sums;
        foreach(pair; 0..4)
        {
            // 16-bit operands are promoted to `int` before multiplying.
            int even = x.array[2*pair]     * y.array[2*pair];
            int odd  = x.array[2*pair + 1] * y.array[2*pair + 1];
            sums.ptr[pair] = even + odd;
        }
        return sums;
    }
}
unittest
{
    short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
    int[4] expected = [1, 13, -2147483648, 2*32767*32767];
    assert(R.array == expected);
}

/// Conditionally store 8-bit integer elements from `a` into memory using `mask`
/// (elements are not stored when the highest bit is not set in the corresponding element)
/// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
/// boundary.
void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
    }
    else static if (LDC_with_ARM64)
    {
        // PERF: catastrophic on ARM32
        // Turn each mask byte into 0xff or 0x00 with an arithmetic shift,
        // then blend `a` into the 16 bytes currently in memory.
        byte16 byteMask = cast(byte16)mask;
        byte16 shift = 7;
        byteMask = byteMask >> shift;
        mask = cast(__m128i) byteMask;
        __m128i current = loadUnaligned!__m128i(cast(int*)mem_addr);
        current = (a & mask) | (current & ~mask);
        storeUnaligned!__m128i(current, cast(int*)mem_addr);
    }
    else
    {
        byte16 values   = cast(byte16)a;
        byte16 selector = cast(byte16)mask;
        byte* dest = cast(byte*)(mem_addr);
        foreach(j; 0..16)
        {
            // A negative signed byte is exactly one whose highest bit is set.
            if (selector.array[j] < 0)
                dest[j] = values.array[j];
        }
    }
}
unittest
{
    ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
    __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] expected = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
    assert(dest == expected);
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pmaxsw since LDC 1.0 -O1
        // ARM: smax.8h since LDC 1.5 -O1
        short8 x = cast(short8)a;
        short8 y = cast(short8)b;
        short8 aIsGreater = greaterMask!short8(x, y);
        return cast(__m128i)( (~aIsGreater & y) | (aIsGreater & x) );
    }
    else
    {
        // Branchless select: b ^ ((a ^ b) & m) yields a where m is all ones, b elsewhere.
        __m128i pickA = _mm_cmpgt_epi16(a, b);
        __m128i diff  = _mm_xor_si128(a, b);
        return _mm_xor_si128(b, _mm_and_si128(diff, pickA));
    }
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-32768, 0,  0));
    short[8] expected =                                 [32767, 1,  9,  7, 9,  7, 0,  0];
    assert(R.array == expected);
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
{
    version(LDC)
    {
        // x86: pmaxub since LDC 1.0.0 -O1
        // ARM64: umax.16b since LDC 1.5.0 -O1
        // PERF: catastrophic on ARM32
        ubyte16 sa = cast(ubyte16)a;
        ubyte16 sb = cast(ubyte16)b;
        ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // Adding -128 to both operands lets a signed comparison order them as unsigned values.
        __m128i value128 = _mm_set1_epi8(-128);
        __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return
/// packed maximum values.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxpd(a, b);
    }
    else
    {
        // x86: Generates maxpd starting with LDC 1.9 -O2
        // When the comparison is false (including unordered operands) the value from `b` is taken.
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 8.0);
}

/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxsd(a, b);
    }
    else
    {
        __m128d r = a;
        // Generates maxsd starting with LDC 1.3
        r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 1.0);
}

/// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to
/// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction
/// is globally visible before any memory instruction which follows the fence in program order.
void _mm_mfence() @trusted // not pure!
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            // The "memory" clobber tells the compiler that the statement touches memory,
            // so it does not move loads or stores across the fence at compile time.
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : "memory";
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_mfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
    else version(LDC)
    {
        // Note: will generate the DMB ish instruction on ARM
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_mfence();
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 greater = greaterMask!short8(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lowerShorts);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-32768),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -32768];
    assert(R.array == correct);
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    version(LDC)
    {
        // x86: pminub since LDC 1.0.0 -O1
        // ARM: umin.16b since LDC 1.5.0 -O1
        // PERF: catastrophic on ARM32
        ubyte16 x = cast(ubyte16)a;
        ubyte16 y = cast(ubyte16)b;
        ubyte16 aIsGreater = cast(ubyte16) greaterMask!ubyte16(x, y);
        return cast(__m128i)( (aIsGreater & y) | (~aIsGreater & x) );
    }
    else
    {
        // Adding -128 to both operands lets a signed comparison order them as unsigned values.
        __m128i bias  = _mm_set1_epi8(-128);
        __m128i pickA = _mm_cmplt_epi8(_mm_add_epi8(a, bias), _mm_add_epi8(b, bias));
        // Branchless select: b ^ ((a ^ b) & m) yields a where m is all ones, b elsewhere.
        __m128i diff  = _mm_xor_si128(a, b);
        return _mm_xor_si128(b, _mm_and_si128(diff, pickA));
    }
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] expected =                               [45, 1,  9,  7, 0,  7, 0,  0, 45, 1,  9,  7, 0,  7, 0,  0];
    assert(R.array == expected);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minpd(a, b);
    }
    else
    {
        // Generates minpd starting with LDC 1.9
        // When the comparison is false the value from `b` is taken.
        __m128d r = a;
        r.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        r.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 1.0);
    __m128d M = _mm_min_pd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 1.0);
}

/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minsd(a, b);
    }
    else
    {
        // Generates minsd starting with LDC 1.3
        double low = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        __m128d r = a;
        r.array[0] = low;
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_min_sd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 3.0);
}

/// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
__m128i _mm_move_epi64 (__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // slightly better with GDC -O0
        return cast(__m128i) __builtin_ia32_movq128(cast(long2)a);
    }
    else
    {
        // Keep the low lane of the copy and clear the high one.
        long2 lanes = cast(long2) a;
        lanes.ptr[1] = 0;
        return cast(__m128i) lanes;
    }
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] expected = [13, 0];
    assert(B.array == expected);
}

/// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy
/// the upper element from `a` to the upper element of dst.
__m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movsd(a, b);
    }
    else
    {
        // Result is [b[0], a[1]]: patch the low lane of the local copy of `a`.
        a.ptr[0] = b.array[0];
        return a;
    }
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] expected = [34.0, 47.0];
    assert(C.array == expected);
}

/// Create mask from the most significant bit of each 8-bit element in `a`.
int _mm_movemask_epi8 (__m128i a) pure @trusted
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pmovmskb128(cast(byte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions lead to unfound intrinsics in LLVM and that took a long time.
        // So there might be something a bit faster, but this one is reasonable and branchless.
        byte8 shifts;
        shifts.ptr[0] = 7;
        shifts.ptr[1] = 6;
        shifts.ptr[2] = 5;
        shifts.ptr[3] = 4;
        shifts.ptr[4] = 3;
        shifts.ptr[5] = 2;
        shifts.ptr[6] = 1;
        shifts.ptr[7] = 0;
        byte8 signBit = byte8(-128);
        byte8 lo = vget_low_u8(cast(byte16)a);
        byte8 hi = vget_high_u8(cast(byte16)a);
        lo = vand_u8(lo, signBit);
        lo = vshr_u8(lo, shifts);
        hi = vand_u8(hi, signBit);
        hi = vshr_u8(hi, shifts);
        // Three pairwise additions collapse the eight lanes into lane 0.
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
    }
    else
    {
        byte16 lanes = cast(byte16)a;
        int bits = 0;
        foreach(index; 0..16)
        {
            // A negative signed byte has its most significant bit set.
            if (lanes.array[index] < 0)
                bits |= (1 << index);
        }
        return bits;
    }
}
unittest
{
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
}

/// Create mask from the most significant bit of each 16-bit element in `a`. #BONUS
int _mm_movemask_epi16 (__m128i a) pure @trusted
{
    // Signed saturation keeps each element's sign, so the byte mask of the
    // packed vector carries the 16-bit sign bits in its low 8 bits.
    __m128i packed = _mm_packs_epi16(a, _mm_setzero_si128());
    return _mm_movemask_epi8(packed);
}
unittest
{
    assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8)));
}

/// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit)
/// floating-point element in `v`.
int _mm_movemask_pd(__m128d v) pure @safe
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_or_LDC_with_SSE2)
    {
        return __builtin_ia32_movmskpd(v);
    }
    else
    {
        // Reinterpreted as a signed 64-bit integer, a lane is negative when its top bit is set.
        long2 bits = cast(long2)v;
        int low  = (bits.array[0] < 0) ? 1 : 0;
        int high = (bits.array[1] < 0) ? 2 : 0;
        return low + high;
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
    assert(_mm_movemask_pd(A) == 2);
}

/// Copy the lower 64-bit integer in `v`.
__m64 _mm_movepi64_pi64 (__m128i v) pure @safe
{
    long2 lanes = cast(long2)v;
    long low = lanes.array[0];
    return long1(low);
}
unittest
{
    __m128i A = _mm_set_epi64x(-1, -2);
    __m64 R = _mm_movepi64_pi64(A);
    assert(R.array[0] == -2);
}

/// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
__m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
{
    long2 lanes;
    lanes.ptr[1] = 0;
    lanes.ptr[0] = a.array[0];
    return cast(__m128i) lanes;
}

/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`,
/// and store the unsigned 64-bit results.
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmuludq128 (a, b);
    }
    else
    {
        // Step 1: zero-extend the even 32-bit lanes of each operand to 64 bits.
        // `wideA` / `wideB` are declared inside the conditional blocks, but those
        // blocks introduce no scope, so both names stay visible for step 2.
        version(LDC)
        {
            static if (__VERSION__ >= 2088)
            {
                // Need LLVM9 for proper optimization
                long2 wideA, wideB;
                wideA.ptr[0] = cast(uint)a.array[0];
                wideA.ptr[1] = cast(uint)a.array[2];
                wideB.ptr[0] = cast(uint)b.array[0];
                wideB.ptr[1] = cast(uint)b.array[2];
            }
            else
            {
                __m128i zero;
                zero = 0;
                long2 wideA = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(a, zero);
                long2 wideB = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(b, zero);
            }
        }
        else
        {
            long2 wideA, wideB;
            wideA.ptr[0] = cast(uint)a.array[0];
            wideA.ptr[1] = cast(uint)a.array[2];
            wideB.ptr[0] = cast(uint)b.array[0];
            wideB.ptr[1] = cast(uint)b.array[2];
        }

        // Step 2: 64-bit multiply, lane by lane.
        version(DigitalMars)
        {
            // DMD has no long2 mul
            wideA.ptr[0] *= wideB.array[0];
            wideA.ptr[1] *= wideB.array[1];
            return cast(__m128i)(wideA);
        }
        else
        {
            static if (__VERSION__ >= 2076)
            {
                return cast(__m128i)(wideA * wideB);
            }
            else
            {
                // long2 mul not supported before LDC 1.5
                wideA.ptr[0] *= wideB.array[0];
                wideA.ptr[1] *= wideB.array[1];
                return cast(__m128i)(wideA);
            }
        }
    }
}
unittest
{
    __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
    __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 product = cast(long2)C;
    assert(product.array[0] == 18446744065119617025uL);
    assert(product.array[1] == 12723420444339690338uL);
}

/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a * b;
}
unittest
{
    __m128d v = [-2.0, 1.5];
    v = _mm_mul_pd(v, v);
    assert(v.array == [4.0, 2.25]);
}

/// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower
/// element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] * b.array[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_mulsd(a, b);
    }
    else
    {
        // Only the low lane of the local copy changes; the upper lane of `a` passes through.
        a.ptr[0] = a.array[0] * b.array[0];
        return a;
    }
}
unittest
{
    __m128d v = [-2.0, 1.5];
    v = _mm_mul_sd(v, v);
    assert(v.array == [4.0, 1.5]);
}

/// Multiply the low unsigned 32-bit integers from `a` and `b`,
/// and get an unsigned 64-bit result.
__m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
{
    // Widen to 128 bits, reuse the packed multiply, then narrow back.
    __m128i wideA = to_m128i(a);
    __m128i wideB = to_m128i(b);
    return to_m64(_mm_mul_epu32(wideA, wideB));
}
unittest
{
    __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
    __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
    __m64 C = _mm_mul_su32(A, B);
    assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
}

/// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the
/// high 16 bits of the intermediate integers.
__m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        // Both compilers expose the builtin under the same name.
        return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
        // PERF: it seems the simde solution has one less instruction in ARM64.
        // PERF: Catastrophic in ARM32.
        short8 x = cast(short8)a;
        short8 y = cast(short8)b;
        short8 high = void;
        foreach(lane; 0..8)
        {
            // Operands are promoted to `int`; keep the upper half of the 32-bit product.
            high.ptr[lane] = cast(short)( (x.array[lane] * y.array[lane]) >> 16 );
        }
        return cast(__m128i) high;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epi16(A, B);
    short[8] expected = [0, -4, 0, 0, 1, 2, 4, 1];
    assert(R.array == expected);
}

/// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the
/// high 16 bits of the intermediate integers.
__m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        // Both compilers expose the builtin under the same name.
        return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
        //        it seems the simde solution has one less instruction in ARM64
        // PERF: Catastrophic in ARM32.
        short8 x = cast(short8)a;
        short8 y = cast(short8)b;
        short8 high = void;
        foreach(lane; 0..8)
        {
            // Reinterpret each lane as unsigned before the promoted multiply;
            // the final cast keeps bits 16..31 of the product.
            ushort ux = cast(ushort) x.array[lane];
            ushort uy = cast(ushort) y.array[lane];
            high.ptr[lane] = cast(short)( (ux * uy) >> 16 );
        }
        return cast(__m128i) high;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(A, B);
    short[8] expected = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
    assert(R.array == expected);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16
/// bits of the intermediate integers.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 x = cast(short8)a;
    short8 y = cast(short8)b;
    return cast(__m128i)(x * y);
}
unittest
{
    __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mullo_epi16(A, B);
    short[8] expected = [0, 0, 0, -16384, 0, 16384, 0, -16384];
    assert(R.array == expected);
}

/// Compute the bitwise NOT of 128 bits in `a`. #BONUS
__m128i _mm_not_si128 (__m128i a) pure @safe
{
    return ~a;
}
unittest
{
    __m128i A = _mm_set1_epi32(-748);
    int4 inverted = cast(int4) _mm_not_si128(A);
    int[4] expected = [747, 747, 747, 747];
    assert(inverted.array == expected);
}

/// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    // The OR is done on the integer reinterpretation of both operands.
    __m128i bitsA = cast(__m128i)a;
    __m128i bitsB = cast(__m128i)b;
    return cast(__m128d)( bitsA | bitsB );
}

/// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return a | b;
}

/// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
__m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        // Both compilers expose the builtin under the same name.
        return cast(__m128i) __builtin_ia32_packssdw128(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        short4 narrowA = vqmovn_s32(cast(int4)a);
        short4 narrowB = vqmovn_s32(cast(int4)b);
        return cast(__m128i)vcombine_s16(narrowA, narrowB);
    }
    else
    {
        // PERF: catastrophic on ARM32
        // Elements 0..3 come from `a`, elements 4..7 from `b`.
        short8 packed;
        foreach(lane; 0..4)
        {
            packed.ptr[lane]     = saturateSignedIntToSignedShort(a.array[lane]);
            packed.ptr[lane + 4] = saturateSignedIntToSignedShort(b.array[lane]);
        }
        return cast(__m128i) packed;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packs_epi32(A, A);
    short[8] expected = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
    assert(R.array == expected);
}

/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        // Both compilers expose the builtin under the same name.
        return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
        byte8 narrowA = vqmovn_s16(cast(short8)a);
        byte8 narrowB = vqmovn_s16(cast(short8)b);
        return cast(__m128i)vcombine_s8(narrowA, narrowB);
    }
    else
    {
        // PERF: ARM32 is missing
        // Elements 0..7 come from `a`, elements 8..15 from `b`.
        short8 x = cast(short8)a;
        short8 y = cast(short8)b;
        byte16 packed;
        foreach(lane; 0..8)
        {
            packed.ptr[lane]     = saturateSignedWordToSignedByte(x.array[lane]);
            packed.ptr[lane + 8] = saturateSignedWordToSignedByte(y.array[lane]);
        }
        return cast(__m128i) packed;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte16 R = cast(byte16) _mm_packs_epi16(A, A);
    byte[16] expected = [127, -128, 127, 0, 127, -128, 127, 0,
                         127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == expected);
}

/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
__m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD catastrophic
    static if (GDC_with_SSE2 || LDC_with_SSE2)
    {
        // Both compilers expose the builtin under the same name.
        return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
        byte8 narrowA = vqmovun_s16(cast(short8)a);
        byte8 narrowB = vqmovun_s16(cast(short8)b);
        return cast(__m128i)vcombine_s8(narrowA, narrowB);
    }
    else
    {
        short8 x = cast(short8)a;
        short8 y = cast(short8)b;
        align(16) ubyte[16] packed = void;
        foreach (int lane; 0..8)
        {
            // Clamp to [0, 255]; bytes 0..7 come from `a`, bytes 8..15 from `b`.
            short fromA = x[lane];
            if (fromA < 0) fromA = 0;
            if (fromA > 255) fromA = 255;
            packed[lane] = cast(ubyte) fromA;

            short fromB = y[lane];
            if (fromB < 0) fromB = 0;
            if (fromB > 255) fromB = 255;
            packed[lane + 8] = cast(ubyte) fromB;
        }
        return *cast(__m128i*)(packed.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    foreach(i; 0..16)
        assert(AA.array[i] == cast(byte)(correctResult[i]));
}

/// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance
/// and power consumption of spin-wait loops.
void _mm_pause() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_pause();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "pause;\n" : : : ;
            }
        }
        else
        {
            // `pause` is only a hint: on GDC targets without an x86 `pause`
            // instruction, do nothing (same as the non-SSE2 LDC branch below)
            // instead of refusing to compile the whole module.
        }
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_pause();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @trusted
        {
            rep; nop; // F3 90 = pause
        }
    }
    else version (LDC)
    {
        // PERF: Do nothing currently, could be the "yield" instruction on ARM.
    }
    else
        static assert(false);
}
unittest
{
    _mm_pause();
}

/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each
/// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the
/// low 16 bits of 64-bit elements in result.
__m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Absolute byte differences, pairwise-widened to eight 16-bit partial sums.
        ushort8 partial = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));

        // PERF: Looks suboptimal vs addp
        ushort lowSum  = cast(ushort)(partial[0] + partial[1] + partial[2] + partial[3]);
        ushort highSum = cast(ushort)(partial[4] + partial[5] + partial[6] + partial[7]);
        ushort8 r = 0;
        r[0] = lowSum;
        r[4] = highSum;
        return cast(__m128i) r;
    }
    else
    {
        // PERF: ARM32 is lacking
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        int4 r = _mm_setzero_si128();
        foreach(i; 0..16)
        {
            int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
            if (diff < 0)
                diff = -diff;
            // Bytes 0..7 accumulate into 32-bit lane 0, bytes 8..15 into lane 2;
            // lanes 1 and 3 stay zero.
            r.ptr[(i >> 3) << 1] += diff;
        }
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m128i B = _mm_set1_epi8(1);
    __m128i R = _mm_sad_epu8(A, B);
    int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values.
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    // Arguments are listed from the highest lane (e7) down to the lowest (e0).
    short8 lanes = void;
    lanes.ptr[7] = e7;
    lanes.ptr[6] = e6;
    lanes.ptr[5] = e5;
    lanes.ptr[4] = e4;
    lanes.ptr[3] = e3;
    lanes.ptr[2] = e2;
    lanes.ptr[1] = e1;
    lanes.ptr[0] = e0;
    return cast(__m128i) lanes;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 B = cast(short8) A;
    foreach(i; 0..8)
        assert(B.array[i] == i);
}

/// Set packed 32-bit integers with the supplied values.
/// `e0` is stored in the lowest element, `e3` in the highest.
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    // PERF: does a constant inline correctly? vs int4 field assignment
    align(16) int[4] lanes = [e0, e1, e2, e3];
    return *cast(int4*)(lanes.ptr);
}
unittest
{
    __m128i A = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(A.array[i] == i);
}

/// Set packed 64-bit integers with the supplied values.
/// `e0` is stored in the low element, `e1` in the high element.
__m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
{
    pragma(inline, true);
    long2 lanes = void;
    lanes.ptr[1] = e1.array[0];
    lanes.ptr[0] = e0.array[0];
    return cast(__m128i)(lanes);
}
unittest
{
    __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

/// Set packed 64-bit integers with the supplied values.
/// `e0` is stored in the low element, `e1` in the high element.
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    pragma(inline, true);
    long2 lanes = void;
    lanes.ptr[1] = e1;
    lanes.ptr[0] = e0;
    return cast(__m128i)(lanes);
}
unittest
{
    __m128i A = _mm_set_epi64x(1234, -5678);
    long2 B = cast(long2) A;
    assert(B.array[0] == -5678);
    assert(B.array[1] == 1234);
}

/// Set packed 8-bit integers with the supplied values.
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    // Memory order is lowest lane first, hence the reversed listing.
    align(16) byte[16] lanes = [e0, e1, e2,  e3,  e4,  e5,  e6,  e7,
                                e8, e9, e10, e11, e12, e13, e14, e15];
    return *cast(__m128i*)(lanes.ptr);
}
unittest
{
    byte16 R = cast(byte16) _mm_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
    byte[16] correct = [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1];
    assert(R.array == correct);
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied values.
/// `e0` is stored in the low element, `e1` in the high element.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    pragma(inline, true);
    double2 lanes = void;
    lanes.ptr[1] = e1;
    lanes.ptr[0] = e0;
    return lanes;
}
unittest
{
    __m128d A = _mm_set_pd(61.0, 55.0);
    double[2] correct = [55.0, 61.0];
    assert(A.array == correct);
}

/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    pragma(inline, true);
    __m128d splat = void;
    splat.ptr[0] = a;
    splat.ptr[1] = a;
    return splat;
}
unittest
{
    __m128d A = _mm_set_pd1(61.0);
    double[2] correct = [61.0, 61.0];
    assert(A.array == correct);
}

/// Copy double-precision (64-bit) floating-point element `a` to the lower element of result,
/// and zero the upper element.
__m128d _mm_set_sd (double a) pure @trusted
{
    double2 lanes = void;
    lanes.ptr[1] = 0.0;
    lanes.ptr[0] = a;
    return lanes;
}
unittest
{
    __m128d A = _mm_set_sd(61.0);
    double[2] correct = [61.0, 0.0];
    assert(A.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        short8 splat = a;
        return cast(__m128i) splat;
    }
    else
    {
        pragma(inline, true);
        return cast(__m128i)(short8(a));
    }
}
unittest
{
    short8 a = cast(short8) _mm_set1_epi16(31);
    for (int i = 0; i < 8; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 32-bit integer `a` to all elements.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    pragma(inline, true);
    return cast(__m128i)(int4(a));
}
unittest
{
    int4 a = cast(int4) _mm_set1_epi32(31);
    for (int i = 0; i < 4; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 64-bit integer `a` to all elements.
__m128i _mm_set1_epi64 (__m64 a) pure @safe
{
    // Same value in both the high and the low element.
    return _mm_set_epi64(a, a);
}
unittest
{
    long b = 0x1DEADCAFE;
    __m64 a;
    a.ptr[0] = b;
    long2 c = cast(long2) _mm_set1_epi64(a);
    assert(c.array[0] == b);
    assert(c.array[1] == b);
}

/// Broadcast 64-bit integer `a` to all elements.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    long2 splat = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
    return cast(__m128i)(splat);
}
unittest
{
    long b = 0x1DEADCAFE;
    long2 c = cast(long2) _mm_set1_epi64x(b);
    for (int i = 0; i < 2; ++i)
        assert(c.array[i] == b);
}

/// Broadcast 8-bit integer `a` to all elements.
__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    pragma(inline, true);
    byte16 splat = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
    return cast(__m128i)(splat);
}
unittest
{
    byte16 b = cast(byte16) _mm_set1_epi8(31);
    for (int i = 0; i < 16; ++i)
        assert(b.array[i] == 31);
}

/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
/// Same function as `_mm_set_pd1`.
alias _mm_set1_pd = _mm_set_pd1;

/// Set packed 16-bit integers with the supplied values in reverse order.
/// The first argument is stored in the lowest element.
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4,
                        short e3, short e2, short e1, short e0) pure @trusted
{
    short8 lanes = void;
    lanes.ptr[0] = e7;
    lanes.ptr[1] = e6;
    lanes.ptr[2] = e5;
    lanes.ptr[3] = e4;
    lanes.ptr[4] = e3;
    lanes.ptr[5] = e2;
    lanes.ptr[6] = e1;
    lanes.ptr[7] = e0;
    return cast(__m128i)(lanes);
}
unittest
{
    short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
    short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
    assert(A.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
/// The first argument is stored in the lowest element.
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    // Performs better than = void; with GDC
    pragma(inline, true);
    align(16) int[4] lanes = [e3, e2, e1, e0];
    return *cast(__m128i*)(lanes.ptr);
}
unittest
{
    int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
    int[4] correct = [-1, 0, -2147483648, 2147483647];
    assert(A.array == correct);
}

/// Set packed 64-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    // "Reverse" order: the first argument goes to the low element.
    long2 lanes = void;
    lanes.ptr[0] = e1;
    lanes.ptr[1] = e0;
    return cast(__m128i)(lanes);
}
unittest
{
    long2 A = cast(long2) _mm_setr_epi64(-1, 0);
    long[2] correct = [-1, 0];
    assert(A.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
/// The first argument is stored in the lowest element.
__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9,  byte e8,
                       byte e7,  byte e6,  byte e5,  byte e4,
                       byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    align(16) byte[16] lanes = [e15, e14, e13, e12, e11, e10, e9, e8,
                                e7,  e6,  e5,  e4,  e3,  e2,  e1, e0];
    return *cast(__m128i*)(lanes.ptr);
}
unittest
{
    byte16 R = cast(byte16) _mm_setr_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
    byte[16] correct = [-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14];
    assert(R.array == correct);
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
/// The first argument is stored in the low element.
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    pragma(inline, true);
    double2 lanes;
    lanes.ptr[0] = e1;
    lanes.ptr[1] = e0;
    return lanes;
}
unittest
{
    __m128d A = _mm_setr_pd(61.0, 55.0);
    double[2] correct = [61.0, 55.0];
    assert(A.array == correct);
}

/// Return vector of type `__m128d` with all elements set to zero.
__m128d _mm_setzero_pd() pure @trusted
{
    pragma(inline, true);
    double2 zero = void;
    zero.ptr[0] = 0.0;
    zero.ptr[1] = 0.0;
    return zero;
}
unittest
{
    __m128d A = _mm_setzero_pd();
    double[2] correct = [0.0, 0.0];
    assert(A.array == correct);
}

/// Return vector of type `__m128i` with all elements set to zero.
__m128i _mm_setzero_si128() pure @trusted
{
    pragma(inline, true);
    int4 zero = void;
    zero.ptr[0] = 0;
    zero.ptr[1] = 0;
    zero.ptr[2] = 0;
    zero.ptr[3] = 0;
    return zero;
}
unittest
{
    __m128i A = _mm_setzero_si128();
    int[4] correct = [0, 0, 0, 0];
    assert(A.array == correct);
}

/// Shuffle 32-bit integers in `a` using the control in `imm8`.
/// Each pair of bits of `imm8` selects the source element of one result element.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @trusted
{
    // Source element index for each destination element, decoded at compile time.
    enum int sel0 = (imm8 >> 0) & 3;
    enum int sel1 = (imm8 >> 2) & 3;
    enum int sel2 = (imm8 >> 4) & 3;
    enum int sel3 = (imm8 >> 6) & 3;

    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufd(a, imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(int4, sel0, sel1, sel2, sel3)(a, a);
    }
    else
    {
        int4 r = void;
        r.ptr[0] = a.ptr[sel0];
        r.ptr[1] = a.ptr[sel1];
        r.ptr[2] = a.ptr[sel2];
        r.ptr[3] = a.ptr[sel3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

/// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
/// See_also: `_MM_SHUFFLE2`.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @trusted
{
    // Bit 0 of `imm8` picks the element taken from `a`, bit 1 the element taken from `b`.
    enum int fromA = imm8 & 1;
    enum int fromB = (imm8 >> 1) & 1;

    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_shufpd(a, b, imm8);
    }
    else version(LDC)
    {
        return shufflevectorLDC!(double2, 0 + fromA, 2 + fromB)(a, b);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[fromA];
        r.ptr[1] = b.array[fromB];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}

/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @trusted
{
    // Source word (within the whole vector) for each of the four high destination words.
    enum int src4 = 4 + ( (imm8 >> 0) & 3 );
    enum int src5 = 4 + ( (imm8 >> 2) & 3 );
    enum int src6 = 4 + ( (imm8 >> 4) & 3 );
    enum int src7 = 4 + ( (imm8 >> 6) & 3 );

    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
    }
    else version(LDC)
    {
        return cast(__m128i) shufflevectorLDC!(short8, 0, 1, 2, 3,
                                               src4, src5, src6, src7)(cast(short8)a, cast(short8)a);
    }
    else
    {
        short8 words = cast(short8)a;
        short8 r = words; // low four words are passed through untouched
        r.ptr[4] = words.array[src4];
        r.ptr[5] = words.array[src5];
        r.ptr[6] = words.array[src6];
        r.ptr[7] = words.array[src7];
        return cast(__m128i) r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}

/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64
/// bits of result, with the high 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @trusted
{
    // Source word for each of the four low destination words.
    enum int src0 = (imm8 >> 0) & 3;
    enum int src1 = (imm8 >> 2) & 3;
    enum int src2 = (imm8 >> 4) & 3;
    enum int src3 = (imm8 >> 6) & 3;

    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
    }
    else version(LDC)
    {
        return cast(__m128i) shufflevectorLDC!(short8, src0, src1, src2, src3,
                                               4, 5, 6, 7)(cast(short8)a, cast(short8)a);
    }
    else
    {
        short8 words = cast(short8)a;
        short8 r = words; // high four words are passed through untouched
        r.ptr[0] = words.array[src0];
        r.ptr[1] = words.array[src1];
        r.ptr[2] = words.array[src2];
        r.ptr[3] = words.array[src3];
        return cast(__m128i) r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}

/// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            pslld XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // The shift amount is the whole low 64-bit element of `count`.
        // PSLLD yields zero when that amount exceeds 31; in D, shifting a
        // 32-bit value by 32 or more is undefined, so saturate explicitly
        // (and do not truncate the amount to `int` before testing it).
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        int4 r = void;
        foreach(i; 0..4)
        {
            if (bits > 31)
                r.ptr[i] = 0;
            else
                r.ptr[i] = cast(int)(cast(uint)(a.array[i]) << cast(int)bits);
        }
        return r;
    }
}

/// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
/// The shift amount is the low 64-bit element of `count`; amounts above 63 give a zero result.
deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllq XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // ARM: good since LDC 1.12 -O2
        // ~but -O0 version is catastrophic
        //
        // PSLLQ yields zero when the amount exceeds 63; in D, shifting a
        // 64-bit value by 64 or more is undefined, so saturate explicitly.
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        long2 r = void;
        foreach(i; 0..2)
        {
            if (bits > 63)
                r.ptr[i] = 0;
            else
                r.ptr[i] = cast(long)(cast(ulong)(sa.array[i]) << cast(int)bits);
        }
        return cast(__m128i)r;
    }
}

/// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        // Call the PSLLW builtin, like `_mm_sra_epi16` / `_mm_srl_epi16` do
        // for their instructions (this branch used to call itself).
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // The shift amount is the whole low 64-bit element of `count`.
        // PSLLW yields zero when that amount exceeds 15, so saturate
        // explicitly rather than shifting by an out-of-range amount.
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
        {
            if (bits > 15)
                r.ptr[i] = 0;
            else
                r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << cast(int)bits);
        }
        return cast(int4)r;
    }
}


/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
/// Only the low 8 bits of `imm8` are used; amounts above 31 give a zero result.
__m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        int4 r = _mm_setzero_si128();

        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            return r;

        foreach(i; 0..4)
            r.array[i] = cast(uint)(a.array[i]) << count;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    __m128i B2 = _mm_slli_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    __m128i C = _mm_slli_epi32(A, 0);
    int[4] expectedC = [ 0, 2, 3, -4];
    assert(C.array == expectedC);

    __m128i D = _mm_slli_epi32(A, 65);
    int[4] expectedD = [ 0, 0, 0, 0];
    assert(D.array == expectedD);
}

/// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
/// Only the low 8 bits of `imm8` are used; amounts above 63 give a zero result.
__m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else
    {
        long2 sa = cast(long2)a;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        long2 r = cast(long2) _mm_setzero_si128();
        ubyte count = cast(ubyte) imm8;
        if (count > 63)
            return cast(__m128i)r;

        r.ptr[0] = cast(ulong)(sa.array[0]) << count;
        r.ptr[1] = cast(ulong)(sa.array[1]) << count;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_slli_epi64(A, 1);
    long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    long2 C = cast(long2) _mm_slli_epi64(A, 0);
    long[2] expectedC = [ 8, -4];
    assert(C.array == expectedC);

    long2 D = cast(long2) _mm_slli_epi64(A, 64);
    long[2] expectedD = [ 0, -0];
    assert(D.array == expectedD);
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
/// Only the low 8 bits of `imm8` are used; amounts above 15 give a zero result.
__m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_ARM64)
    {
        short8 sa = cast(short8)a;
        short8 r = cast(short8)_mm_setzero_si128();
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m128i)r;
        r = sa << short8(count);
        return cast(__m128i)r;
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 r = cast(short8)_mm_setzero_si128();
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m128i)r;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(sa.array[i] << count);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
    short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
    short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}


/// Shift `a` left by `bytes` bytes while shifting in zeros.
/// A `bytes` value of 16 or more gives a zero result.
__m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
{
    static if (bytes & 0xF0)
    {
        // Any of bits 4..7 set means a shift of at least 16 bytes: everything is shifted out.
        return _mm_setzero_si128();
    }
    else static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd_ib(XMM.PSLLDQ, op, bytes);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8));
    }
    else version(LDC)
    {
        // Indices 0..15 select from the zero vector, 16..31 from `op`.
        return cast(__m128i) shufflevectorLDC!(byte16,
                                               16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
                                               22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
                                               28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
                                               (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
        {
            movdqu XMM0, op;
            pslldq XMM0, bytes;
            movdqu op, XMM0;
        }
        return op;
    }
    else
    {
        byte16 A = cast(byte16)op;
        byte16 R = void;
        for (int n = 15; n >= bytes; --n)
            R.ptr[n] = A.array[n-bytes];
        for (int n = bytes-1; n >= 0; --n)
            R.ptr[n] = 0;
        return cast(__m128i)R;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
    short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(R.array == correct);

    __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1));
    int[4] expectedB = [0, 0, 0, 0];
    assert(B.array == expectedB);
}

/// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
__m128d _mm_sqrt_pd(__m128d vec) pure @trusted
{
    version(LDC)
    {
        // Disappeared with LDC 1.11
        static if (__VERSION__ < 2081)
            return __builtin_ia32_sqrtpd(vec);
        else
        {
            // PERF: use llvm_sqrt on the vector
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_sqrtpd(vec);
    }
    else
    {
        vec.ptr[0] = sqrt(vec.array[0]);
        vec.ptr[1] = sqrt(vec.array[1]);
        return vec;
    }
}
unittest
{
    __m128d A = _mm_sqrt_pd(_mm_setr_pd(4.0, 9.0));
    double[2] correct = [2.0, 3.0];
    assert(A.array == correct);
}

/// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
{
    // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
    //       "128-bit Legacy SSE version: The first source operand and the destination operand are the same.
    //       The quadword at bits 127:64 of the destination operand remains unchanged."
    version(LDC)
    {
        // Disappeared with LDC 1.11
        static if (__VERSION__ < 2081)
        {
            __m128d c = __builtin_ia32_sqrtsd(b);
            a[0] = c[0];
            return a;
        }
        else
        {
            a.array[0] = llvm_sqrt(b.array[0]);
            return a;
        }
    }
    else static if (GDC_with_SSE2)
    {
        __m128d c = __builtin_ia32_sqrtsd(b);
        a.ptr[0] = c.array[0];
        return a;
    }
    else
    {
        a.ptr[0] = sqrt(b.array[0]);
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    __m128d R = _mm_sqrt_sd(A, B);
    double[2] correct = [2.0, 3.0 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
/// The shift amount is the low 64-bit element of `count`; amounts above 15 fill each element with its sign bit.
deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
    }
    else
    {
        // PSRAW saturates the amount at 15 (result is all sign bits). Clamp the
        // full 64-bit amount before shifting, both to match that and to avoid
        // an out-of-range shift, which D leaves undefined.
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc.array[0]);
        int shift = (bits > 15) ? 15 : cast(int)bits;
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(sa.array[i] >> shift);
        return cast(int4)r;
    }
}

/// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
4085 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted 4086 { 4087 static if (LDC_with_SSE2) 4088 { 4089 return __builtin_ia32_psrad128(a, count); 4090 } 4091 else static if (GDC_with_SSE2) 4092 { 4093 return __builtin_ia32_psrad128(a, count); 4094 } 4095 else 4096 { 4097 int4 r = void; 4098 long2 lc = cast(long2)count; 4099 int bits = cast(int)(lc.array[0]); 4100 r.ptr[0] = (a.array[0] >> bits); 4101 r.ptr[1] = (a.array[1] >> bits); 4102 r.ptr[2] = (a.array[2] >> bits); 4103 r.ptr[3] = (a.array[3] >> bits); 4104 return r; 4105 } 4106 } 4107 4108 4109 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 4110 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 4111 { 4112 static if (GDC_with_SSE2) 4113 { 4114 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 4115 } 4116 else static if (LDC_with_SSE2) 4117 { 4118 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 4119 } 4120 else static if (LDC_with_ARM64) 4121 { 4122 short8 sa = cast(short8)a; 4123 ubyte count = cast(ubyte)imm8; 4124 if (count > 15) 4125 count = 15; 4126 short8 r = sa >> short8(count); 4127 return cast(__m128i)r; 4128 } 4129 else 4130 { 4131 short8 sa = cast(short8)a; 4132 short8 r = void; 4133 4134 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4135 // D says "It's illegal to shift by the same or more bits 4136 // than the size of the quantity being shifted" 4137 // and it's UB instead. 
4138 ubyte count = cast(ubyte)imm8; 4139 if (count > 15) 4140 count = 15; 4141 foreach(i; 0..8) 4142 r.ptr[i] = cast(short)(sa.array[i] >> count); 4143 return cast(int4)r; 4144 } 4145 } 4146 unittest 4147 { 4148 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 4149 short8 B = cast(short8)( _mm_srai_epi16(A, 1) ); 4150 short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) ); 4151 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 4152 assert(B.array == expectedB); 4153 assert(B2.array == expectedB); 4154 4155 short8 C = cast(short8)( _mm_srai_epi16(A, 18) ); 4156 short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ]; 4157 assert(C.array == expectedC); 4158 } 4159 4160 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 4161 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted 4162 { 4163 static if (LDC_with_SSE2) 4164 { 4165 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 4166 } 4167 else static if (GDC_with_SSE2) 4168 { 4169 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 4170 } 4171 else 4172 { 4173 int4 r = void; 4174 4175 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4176 // D says "It's illegal to shift by the same or more bits 4177 // than the size of the quantity being shifted" 4178 // and it's UB instead. 
4179 ubyte count = cast(ubyte) imm8; 4180 if (count > 31) 4181 count = 31; 4182 4183 r.ptr[0] = (a.array[0] >> count); 4184 r.ptr[1] = (a.array[1] >> count); 4185 r.ptr[2] = (a.array[2] >> count); 4186 r.ptr[3] = (a.array[3] >> count); 4187 return r; 4188 } 4189 } 4190 unittest 4191 { 4192 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 4193 __m128i B = _mm_srai_epi32(A, 1); 4194 __m128i B2 = _mm_srai_epi32(A, 1 + 256); 4195 int[4] expectedB = [ 0, 1, 1, -2]; 4196 assert(B.array == expectedB); 4197 assert(B2.array == expectedB); 4198 4199 __m128i C = _mm_srai_epi32(A, 32); 4200 int[4] expectedC = [ 0, 0, 0, -1]; 4201 assert(C.array == expectedC); 4202 4203 __m128i D = _mm_srai_epi32(A, 0); 4204 int[4] expectedD = [ 0, 2, 3, -4]; 4205 assert(D.array == expectedD); 4206 } 4207 4208 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted 4209 { 4210 static if (LDC_with_SSE2) 4211 { 4212 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 4213 } 4214 else static if (GDC_with_SSE2) 4215 { 4216 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 4217 } 4218 else 4219 { 4220 short8 sa = cast(short8)a; 4221 long2 lc = cast(long2)count; 4222 int bits = cast(int)(lc.array[0]); 4223 short8 r = void; 4224 foreach(i; 0..8) 4225 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits); 4226 return cast(int4)r; 4227 } 4228 } 4229 4230 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted 4231 { 4232 static if (LDC_with_SSE2) 4233 { 4234 return __builtin_ia32_psrld128(a, count); 4235 } 4236 else static if (GDC_with_SSE2) 4237 { 4238 return __builtin_ia32_psrld128(a, count); 4239 } 4240 else 4241 { 4242 int4 r = void; 4243 long2 lc = cast(long2)count; 4244 int bits = cast(int)(lc.array[0]); 4245 r.ptr[0] = cast(uint)(a.array[0]) >> bits; 4246 r.ptr[1] = cast(uint)(a.array[1]) >> bits; 4247 r.ptr[2] = cast(uint)(a.array[2]) >> bits; 
4248 r.ptr[3] = cast(uint)(a.array[3]) >> bits; 4249 return r; 4250 } 4251 } 4252 4253 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted 4254 { 4255 static if (LDC_with_SSE2) 4256 { 4257 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 4258 } 4259 else static if (GDC_with_SSE2) 4260 { 4261 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 4262 } 4263 else 4264 { 4265 // Workaround for https://issues.dlang.org/show_bug.cgi?id=23047 4266 // => avoid void initialization. 4267 long2 r; 4268 long2 sa = cast(long2)a; 4269 long2 lc = cast(long2)count; 4270 int bits = cast(int)(lc.array[0]); 4271 r.ptr[0] = cast(ulong)(sa.array[0]) >> bits; 4272 r.ptr[1] = cast(ulong)(sa.array[1]) >> bits; 4273 return cast(__m128i)r; 4274 } 4275 } 4276 4277 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 4278 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted 4279 { 4280 static if (GDC_with_SSE2) 4281 { 4282 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 4283 } 4284 else static if (LDC_with_SSE2) 4285 { 4286 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 4287 } 4288 else static if (LDC_with_ARM64) 4289 { 4290 short8 sa = cast(short8)a; 4291 short8 r = cast(short8) _mm_setzero_si128(); 4292 4293 ubyte count = cast(ubyte)imm8; 4294 if (count >= 16) 4295 return cast(__m128i)r; 4296 4297 r = sa >>> short8(count); // This facility offered with LDC, but not DMD. 
4298 return cast(__m128i)r; 4299 } 4300 else 4301 { 4302 short8 sa = cast(short8)a; 4303 ubyte count = cast(ubyte)imm8; 4304 4305 short8 r = cast(short8) _mm_setzero_si128(); 4306 if (count >= 16) 4307 return cast(__m128i)r; 4308 4309 foreach(i; 0..8) 4310 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count); 4311 return cast(__m128i)r; 4312 } 4313 } 4314 unittest 4315 { 4316 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 4317 short8 B = cast(short8)( _mm_srli_epi16(A, 1) ); 4318 short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) ); 4319 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 4320 assert(B.array == expectedB); 4321 assert(B2.array == expectedB); 4322 4323 short8 C = cast(short8)( _mm_srli_epi16(A, 16) ); 4324 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0]; 4325 assert(C.array == expectedC); 4326 4327 short8 D = cast(short8)( _mm_srli_epi16(A, 0) ); 4328 short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ]; 4329 assert(D.array == expectedD); 4330 } 4331 4332 4333 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 4334 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted 4335 { 4336 static if (GDC_with_SSE2) 4337 { 4338 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4339 } 4340 else static if (LDC_with_SSE2) 4341 { 4342 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4343 } 4344 else 4345 { 4346 ubyte count = cast(ubyte) imm8; 4347 4348 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4349 // D says "It's illegal to shift by the same or more bits 4350 // than the size of the quantity being shifted" 4351 // and it's UB instead. 
4352 int4 r = _mm_setzero_si128(); 4353 if (count >= 32) 4354 return r; 4355 r.ptr[0] = a.array[0] >>> count; 4356 r.ptr[1] = a.array[1] >>> count; 4357 r.ptr[2] = a.array[2] >>> count; 4358 r.ptr[3] = a.array[3] >>> count; 4359 return r; 4360 } 4361 } 4362 unittest 4363 { 4364 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 4365 __m128i B = _mm_srli_epi32(A, 1); 4366 __m128i B2 = _mm_srli_epi32(A, 1 + 256); 4367 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; 4368 assert(B.array == expectedB); 4369 assert(B2.array == expectedB); 4370 4371 __m128i C = _mm_srli_epi32(A, 255); 4372 int[4] expectedC = [ 0, 0, 0, 0 ]; 4373 assert(C.array == expectedC); 4374 } 4375 4376 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 4377 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted 4378 { 4379 static if (GDC_with_SSE2) 4380 { 4381 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4382 } 4383 else static if (LDC_with_SSE2) 4384 { 4385 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4386 } 4387 else 4388 { 4389 long2 r = cast(long2) _mm_setzero_si128(); 4390 long2 sa = cast(long2)a; 4391 4392 ubyte count = cast(ubyte) imm8; 4393 if (count >= 64) 4394 return cast(__m128i)r; 4395 4396 r.ptr[0] = sa.array[0] >>> count; 4397 r.ptr[1] = sa.array[1] >>> count; 4398 return cast(__m128i)r; 4399 } 4400 } 4401 unittest 4402 { 4403 __m128i A = _mm_setr_epi64(8, -4); 4404 long2 B = cast(long2) _mm_srli_epi64(A, 1); 4405 long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512); 4406 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; 4407 assert(B.array == expectedB); 4408 assert(B2.array == expectedB); 4409 4410 long2 C = cast(long2) _mm_srli_epi64(A, 64); 4411 long[2] expectedC = [ 0, 0 ]; 4412 assert(C.array == expectedC); 4413 } 4414 4415 /// Shift `v` right by `bytes` bytes while shifting in zeros. 
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @trusted
{
    // `bytes` is a template argument, so the implementation is picked at compile time.
    static if (bytes & 0xF0)
    {
        // 16 bytes or more: nothing of `v` survives the shift.
        return _mm_setzero_si128();
    }
    else static if (DMD_with_DSIMD)
    {
        return cast(__m128i) __simd_ib(XMM.PSRLDQ, v, bytes);
    }
    else static if (GDC_with_SSE2)
    {
        // The GDC builtin is given the shift amount multiplied by 8.
        return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, v;
            psrldq XMM0, bytes;
            movdqu v, XMM0;
        }
        return v;
    }
    else version(LDC)
    {
        // Byte shuffle over `v` and a zero vector, starting at lane `bytes`.
        return cast(__m128i) shufflevectorLDC!(byte16,
                             bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
                             bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
                             (cast(byte16) v, cast(byte16)_mm_setzero_si128());
    }
    else
    {
        // Portable path: destination lane `lane` receives source lane `lane + bytes`,
        // or zero when that source lane lies beyond the vector.
        byte16 src = cast(byte16)v;
        byte16 shifted = void;
        foreach (lane; 0..16)
        {
            int from = lane + bytes;
            shifted.ptr[lane] = (from < 16) ? src.array[from] : cast(byte)0;
        }
        return cast(__m128i)shifted;
    }
}
unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, -2, 1));
    int[4] correct = [-2, 3, 4, 0];
    assert(R.array == correct);

    __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
    int[4] expectedA = [0, 0, 0, 0];
    assert(A.array == expectedA);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    // Reinterpret as integer lanes, reuse the byte shift, reinterpret back.
    __m128i shifted = _mm_srli_si128!bytes(cast(__m128i)v);
    return cast(__m128)shifted;
}
unittest
{
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    // Reinterpret as integer lanes, reuse the byte shift, reinterpret back.
    __m128i shifted = _mm_srli_si128!bytes(cast(__m128i)v);
    return cast(__m128d)shifted;
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
{
    pragma(inline, true);
    // Whole-vector store through a vector-typed view of the destination.
    *cast(__m128d*)mem_addr = a;
}
unittest
{
    align(16) double[2] A;
    __m128d B = _mm_setr_pd(-8.0, 9.0);
    _mm_store_pd(A.ptr, B);
    assert(A == [-8.0, 9.0]);
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    // Duplicate lane 0 into both lanes, then do a single vector store.
    double low = a.array[0];
    __m128d splat; // PERF =void;
    splat.ptr[0] = low;
    splat.ptr[1] = low;
    *cast(__m128d*)mem_addr = splat;
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to
/// be aligned on any particular boundary.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    mem_addr[0 .. 1][0] = a.array[0];
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a
/// general-protection exception may be generated.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1; ///

/// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    double upper = a.array[1];
    *mem_addr = upper;
}

/// Store the lower 64-bit integer of `a` into memory.
// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exist in C++.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    // View the vector as two 64-bit lanes and write only lane 0.
    long2 lanes = cast(long2)a;
    long* target = cast(long*)mem_addr;
    *target = lanes.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    double lower = a.array[0];
    *mem_addr = lower;
}

/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse
/// order. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @system
{
    // Build the lane-swapped vector first, then store it in one go.
    __m128d swapped = void;
    swapped.ptr[1] = a.array[0];
    swapped.ptr[0] = a.array[1];
    __m128d* target = cast(__m128d*)mem_addr;
    *target = swapped;
}
unittest
{
    align(16) double[2] A = [0.0, 1.0];
    _mm_storer_pd(A.ptr, _mm_setr_pd(2.0, 3.0));
    assert(A[0] == 3.0 && A[1] == 2.0);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from
/// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @trusted // TODO: signature, should be system
{
    // PERF DMD
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        // GDC: dedicated unaligned-store builtin.
        __builtin_ia32_storeupd(mem_addr, a);
    }
    else version(LDC)
    {
        // LDC: unaligned vector store helper.
        storeUnaligned!double2(a, mem_addr);
    }
    else
    {
        // Other compilers: two scalar stores, which carry no vector alignment requirement.
        mem_addr[0] = a.array[0];
        mem_addr[1] = a.array[1];
    }
}
unittest
{
    __m128d A = _mm_setr_pd(3.0, 4.0);
    align(16) double[4] R = [0.0, 0, 0, 0];
    double[2] correct = [3.0, 4.0];
    // &R[1] is 8 bytes past a 16-byte aligned address, hence deliberately misaligned.
    _mm_storeu_pd(&R[1], A);
    assert(R[1..3] == correct);
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular
/// boundary.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @trusted // TODO: signature is wrong, mem_addr is not aligned. Make it @system
{
    // PERF: DMD
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        // GDC: dedicated unaligned-store builtin, which works on byte vectors.
        __builtin_ia32_storedqu(cast(char*)mem_addr, cast(ubyte16)a);
    }
    else version(LDC)
    {
        // LDC: unaligned vector store helper.
        storeUnaligned!__m128i(a, cast(int*)mem_addr);
    }
    else
    {
        // Other compilers: four scalar 32-bit stores, one per lane.
        int* p = cast(int*)mem_addr;
        p[0] = a.array[0];
        p[1] = a.array[1];
        p[2] = a.array[2];
        p[3] = a.array[3];
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    align(16) int[6] R = [0, 0, 0, 0, 0, 0];
    int[4] correct = [1, 2, 3, 4];
    // &R[1] is 4 bytes past a 16-byte aligned address, hence deliberately misaligned.
    _mm_storeu_si128(cast(__m128i*)(&R[1]), A);
    assert(R[1..5] == correct);
}

/// Store 16-bit integer from the first element of `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si16 (void* mem_addr, __m128i a) pure @system
{
    // View the vector as eight 16-bit lanes and write only lane 0.
    short8 lanes = cast(short8)a;
    *cast(short*)mem_addr = lanes.array[0];
}
unittest
{
    short[2] arr = [-24, 12];
    _mm_storeu_si16(&arr[1], _mm_set1_epi16(26));
    short[2] correct = [-24, 26];
    assert(arr == correct);
}

/// Store 32-bit integer from the first element of `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted // TODO should really be @system
{
    pragma(inline, true);
    // `__m128i` already exposes 32-bit lanes, so lane 0 is written directly.
    int first = a.array[0];
    *cast(int*)mem_addr = first;
}
unittest
{
    int[2] arr = [-24, 12];
    _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
    assert(arr == [-24, -1]);
}

/// Store 64-bit integer from the first element of `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si64 (void* mem_addr, __m128i a) pure @system
{
    pragma(inline, true);
    // View the vector as two 64-bit lanes and write only lane 0.
    long2 lanes = cast(long2)a;
    *cast(long*)mem_addr = lanes.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storeu_si64(&A[1], _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_pd (double* mem_addr, __m128d a) pure @system
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movntpd(mem_addr, a);
    }
    else version(LDC)
    {
        // Inline LLVM IR: a 16-byte aligned vector store tagged with `!nontemporal`
        // metadata (declared by `prefix`).
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <2 x double> %1, <2 x double>* %0, align 16, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, double2*, double2)(cast(double2*)mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m128d* dest = cast(__m128d*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(16) double[2] A;
    __m128d B = _mm_setr_pd(-8.0, 9.0);
    _mm_stream_pd(A.ptr, B);
    assert(A == [-8.0, 9.0]);
}

/// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movntdq (cast(long2*)mem_addr, cast(long2)a);
    }
    else version(LDC)
    {
        // Inline LLVM IR: a 16-byte aligned vector store tagged with `!nontemporal`
        // metadata (declared by `prefix`).
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x i32> %1, <4 x i32>* %0, align 16, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, int4*, int4)(cast(int4*)mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m128i* dest = cast(__m128i*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(16) int[4] A;
    __m128i B = _mm_setr_epi32(-8, 9, 10, -11);
    _mm_stream_si128(cast(__m128i*)A.ptr, B);
    assert(A == [-8, 9, 10, -11]);
}

/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
/// pollution.
/// If the cache line containing address `mem_addr` is already in the cache,
/// the cache will be updated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_si32 (int* mem_addr, int a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movnti(mem_addr, a);
    }
    else version(LDC)
    {
        // Inline LLVM IR: a scalar 32-bit store tagged with `!nontemporal` metadata
        // (declared by `prefix`).
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store i32 %1, i32* %0, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, int*, int)(mem_addr, a);
    }
    else
    {
        // Regular store instead.
        *mem_addr = a;
    }
}
unittest
{
    int A;
    _mm_stream_si32(&A, -34);
    assert(A == -34);
}

/// Store 64-bit integer a into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address `mem_addr` is already
/// in the cache, the cache will be updated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_si64 (long* mem_addr, long a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movnti64(mem_addr, a);
    }
    else version(LDC)
    {
        // Inline LLVM IR: a scalar 64-bit store tagged with `!nontemporal` metadata
        // (declared by `prefix`).
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store i64 %1, i64* %0, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, long*, long)(mem_addr, a);

    }
    else
    {
        // Regular store instead.
        *mem_addr = a;
    }
}
unittest
{
    long A;
    _mm_stream_si64(&A, -46);
    assert(A == -46);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    // Lane-wise subtraction on the 8 x 16-bit view; overflow wraps around.
    short8 minuend = cast(short8)a;
    short8 subtrahend = cast(short8)b;
    return cast(__m128i)(minuend - subtrahend);
}
unittest
{
    __m128i A = _mm_setr_epi16(16,  32767, 1, 2,    3, 4, 6, 6);
    __m128i B = _mm_setr_epi16(15, -32768, 6, 8, 1000, 1, 5, 6);
    short8 C = cast(short8) _mm_sub_epi16(A, B);
    short[8] correct =        [ 1,     -1,-5,-6, -997, 3, 1, 0];
    assert(C.array == correct);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    // Lane-wise subtraction on the 4 x 32-bit view; overflow wraps around.
    int4 minuend = cast(int4)a;
    int4 subtrahend = cast(int4)b;
    return cast(__m128i)(minuend - subtrahend);
}
unittest
{
    __m128i A = _mm_setr_epi32(16, int.max, 1, 8);
    __m128i B = _mm_setr_epi32(15, int.min, 6, 2);
    int4 C = cast(int4) _mm_sub_epi32(A, B);
    int[4] correct =          [ 1,      -1,-5, 6];
    assert(C.array == correct);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    // Lane-wise subtraction on the 2 x 64-bit view; overflow wraps around.
    long2 minuend = cast(long2)a;
    long2 subtrahend = cast(long2)b;
    return cast(__m128i)(minuend - subtrahend);
}
unittest
{
    __m128i A = _mm_setr_epi64(  16, long.max);
    __m128i B = _mm_setr_epi64( 199, long.min);
    long2 C = cast(long2) _mm_sub_epi64(A, B);
    long[2] correct =         [-183,       -1];
    assert(C.array == correct);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    // Lane-wise subtraction on the 16 x 8-bit view; overflow wraps around.
    byte16 minuend = cast(byte16)a;
    byte16 subtrahend = cast(byte16)b;
    return cast(__m128i)(minuend - subtrahend);
}
unittest
{
    __m128i A = _mm_setr_epi8(16,  127, 1, 2, 3, 4, 6, 6, 16,  127, 1, 2, 3, 4, 6, 6);
    __m128i B = _mm_setr_epi8(15, -128, 6, 8, 3, 1, 5, 6, 16,  127, 1, 2, 3, 4, 6, 6);
    byte16 C = cast(byte16) _mm_sub_epi8(A, B);
    byte[16] correct =       [ 1,   -1,-5,-6, 0, 3, 1, 0,  0,    0, 0, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit)
/// floating-point elements in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    __m128d difference = a - b;
    return difference;
}
unittest
{
    __m128d A = _mm_setr_pd(4000.0, -8.0);
    __m128d B = _mm_setr_pd(12.0, -8450.0);
    __m128d C = _mm_sub_pd(A, B);
    double[2] correct =     [3988.0, 8442.0];
    assert(C.array == correct);
}

/// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit)
/// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
/// upper element of result.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_subsd(a, b);
    }
    else
    {
        // Only lane 0 is modified; lane 1 of `a` is returned untouched.
        a.ptr[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}

/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    pragma(inline, true);
    return a - b;
}
unittest
{
    __m64 A, B;
    A = -1214;
    B = 489415;
    __m64 C = _mm_sub_si64(B, A);
    assert(C.array[0] == 489415 + 1214);
}

/// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed integers in `a`
/// using signed saturation.
__m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PSUBSW since LDC 1.15 -O0

            // Calls the generic LLVM saturating-subtract intrinsic through inline IR.
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            // Scalar emulation: subtract as int, then clamp into the short range.
            short[8] res; // PERF: =void;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
    }
    else
    {
        // Scalar emulation: subtract as int, then clamp into the short range.
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult =              [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed integers in `a`
/// using signed saturation.
__m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBSB since LDC 1.15 -O0
            // ARM: Generates sqsub.16b since LDC 1.21 -O0
            // Calls the generic LLVM saturating-subtract intrinsic through inline IR.
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            // Scalar emulation: subtract in a wider type, then clamp into the byte range.
            byte[16] res; // PERF =void;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        // Scalar emulation: subtract in a wider type, then clamp into the byte range.
        byte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    // Lane 0: -128 - 15 saturates to -128; lane 1: 127 - (-14) saturates to 127.
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult            = [-128, 127,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned integers in `a`
/// using unsigned saturation.
__m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
{
    // Computes max(a - b, 0) on each unsigned 16-bit lane; every branch below must agree on that.
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSW since LDC 1.15 -O0
            // ARM: Generates uqsub.8h since LDC 1.21 -O0
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res; // PERF =void;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                // Widen to int so a negative difference is visible to the clamp.
                int diff = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(diff);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            // The builtin works on 16-bit lanes: cast explicitly, as the GDC branch does.
            return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            // Widen to int so a negative difference is visible to the clamp.
            int diff = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            res[i] = saturateSignedIntToUnsignedShort(diff);
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct =                  [               0,  0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}

/// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned integers in `a`
using unsigned saturation. 5112 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted 5113 { 5114 version(LDC) 5115 { 5116 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 5117 { 5118 // x86: Generates PSUBUSB since LDC 1.15 -O0 5119 // ARM: Generates uqsub.16b since LDC 1.21 -O0 5120 enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 5121 enum ir = ` 5122 %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 5123 ret <16 x i8> %r`; 5124 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 5125 } 5126 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 5127 { 5128 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 5129 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted 5130 { 5131 ubyte[16] res; // PERF =void; 5132 byte16 sa = cast(byte16)a; 5133 byte16 sb = cast(byte16)b; 5134 foreach(i; 0..16) 5135 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i])); 5136 return _mm_loadu_si128(cast(int4*)res.ptr); 5137 } 5138 } 5139 else static if (LDC_with_SSE2) 5140 { 5141 return __builtin_ia32_psubusb128(a, b); 5142 } 5143 else 5144 static assert(false); 5145 } 5146 else static if (GDC_with_SSE2) 5147 { 5148 return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b); 5149 } 5150 else 5151 { 5152 ubyte[16] res; // PERF =void; 5153 byte16 sa = cast(byte16)a; 5154 byte16 sb = cast(byte16)b; 5155 foreach(i; 0..16) 5156 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i])); 5157 return _mm_loadu_si128(cast(int4*)res.ptr); 5158 } 5159 } 5160 unittest 5161 { 5162 byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 5163 _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 5164 static 
immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 5165 assert(res.array == correctResult); 5166 } 5167 5168 // Note: the only difference between these intrinsics is the signalling 5169 // behaviour of quiet NaNs. This is incorrect but the case where 5170 // you would want to differentiate between qNaN and sNaN and then 5171 // treat them differently on purpose seems extremely rare. 5172 alias _mm_ucomieq_sd = _mm_comieq_sd; /// 5173 alias _mm_ucomige_sd = _mm_comige_sd; /// 5174 alias _mm_ucomigt_sd = _mm_comigt_sd; /// 5175 alias _mm_ucomile_sd = _mm_comile_sd; /// 5176 alias _mm_ucomilt_sd = _mm_comilt_sd; /// 5177 alias _mm_ucomineq_sd = _mm_comineq_sd; /// 5178 5179 /// Return vector of type `__m128d` with undefined elements. 5180 __m128d _mm_undefined_pd() pure @safe 5181 { 5182 pragma(inline, true); 5183 __m128d result = void; 5184 return result; 5185 } 5186 5187 /// Return vector of type `__m128i` with undefined elements. 5188 __m128i _mm_undefined_si128() pure @safe 5189 { 5190 pragma(inline, true); 5191 __m128i result = void; 5192 return result; 5193 } 5194 5195 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`. 
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @trusted
{
    // Result lanes are a4, b4, a5, b5, a6, b6, a7, b7.
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        short8 lhs = cast(short8) a;
        short8 rhs = cast(short8) b;
        return cast(__m128i) __builtin_ia32_punpckhwd128(lhs, rhs);
    }
    else version(LDC)
    {
        // Shuffle indices 0..7 select from `a`, 8..15 from `b`.
        return cast(__m128i) shufflevectorLDC!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                              (cast(short8)a, cast(short8)b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Portable path: every lane of `mixed` is written, so it can start uninitialized.
        short8 hiA = cast(short8)a;
        short8 hiB = cast(short8)b;
        short8 mixed = void;
        foreach (k; 0 .. 4)
        {
            mixed.ptr[2 * k]     = hiA.array[4 + k];
            mixed.ptr[2 * k + 1] = hiB.array[4 + k];
        }
        return cast(__m128i)mixed;
    }
}
unittest
{
    __m128i first  = _mm_setr_epi16( 4,  5,  6,  7,  8,  9, 10, 11);
    __m128i second = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 interleaved = cast(short8)(_mm_unpackhi_epi16(first, second));
    short[8] expected = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(interleaved.array == expected);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
{
    // Result lanes are a2, b2, a3, b3.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhdq128(a, b);
    }
    else version(LDC)
    {
        // Shuffle indices 0..3 select from `a`, 4..7 from `b`.
        return shufflevectorLDC!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
    }
    else
    {
        // Portable path: all four lanes are written before the value is returned.
        __m128i mixed = void;
        foreach (k; 0 .. 2)
        {
            mixed.ptr[2 * k]     = a.array[2 + k];
            mixed.ptr[2 * k + 1] = b.array[2 + k];
        }
        return mixed;
    }
}
unittest
{
    __m128i first  = _mm_setr_epi32(1, 2, 3, 4);
    __m128i second = _mm_setr_epi32(5, 6, 7, 8);
    __m128i interleaved = _mm_unpackhi_epi32(first, second);
    int[4] expected = [3, 7, 4, 8];
    assert(interleaved.array == expected);
}

/// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        long2 lhs = cast(long2) a;
        long2 rhs = cast(long2) b;
        return cast(__m128i) __builtin_ia32_punpckhqdq128(lhs, rhs);
    }
    else
    {
        // Start from `b` so its upper 64 bits (32-bit lanes 2 and 3) are already
        // in place, then overwrite the lower 64 bits with the upper 64 bits of `a`.
        __m128i merged = cast(__m128i)b;
        merged[0] = a[2];
        merged[1] = a[3];
        return merged;
    }
}
unittest // Issue #36
{
    __m128i first  = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i second = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 interleaved = cast(long2)(_mm_unpackhi_epi64(first, second));
    long[2] expected = [0x33333333_33333333, 0x55555555_55555555];
    assert(interleaved.array == expected);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @trusted
{
    // Result lanes are a8, b8, a9, b9, ..., a15, b15.
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        ubyte16 lhs = cast(ubyte16)a;
        ubyte16 rhs = cast(ubyte16)b;
        return cast(__m128i) __builtin_ia32_punpckhbw128(lhs, rhs);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else version(LDC)
    {
        // Shuffle indices 0..15 select from `a`, 16..31 from `b`.
        return cast(__m128i)shufflevectorLDC!(byte16, 8,  24,  9, 25, 10, 26, 11, 27,
                                                      12, 28, 13, 29, 14, 30, 15, 31)
                                             (cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Portable path: all sixteen lanes are written before the value is returned.
        byte16 hiA = cast(byte16)a;
        byte16 hiB = cast(byte16)b;
        byte16 mixed = void;
        foreach (k; 0 .. 8)
        {
            mixed.ptr[2 * k]     = hiA.array[8 + k];
            mixed.ptr[2 * k + 1] = hiB.array[8 + k];
        }
        return cast(__m128i)mixed;
    }
}
unittest
{
    __m128i first  = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
    __m128i second = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 interleaved = cast(byte16) _mm_unpackhi_epi8(first, second);
    byte[16] expected = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
    assert(interleaved.array == expected);
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @trusted
{
    // Result lanes are a1, b1.
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpckhpd(a, b);
    }
    else version(LDC)
    {
        // Shuffle indices 0..1 select from `a`, 2..3 from `b`.
        return shufflevectorLDC!(__m128d, 1, 3)(a, b);
    }
    else
    {
        // Portable path: both lanes are written before the value is returned.
        double2 upper = void;
        upper.ptr[0] = a.array[1];
        upper.ptr[1] = b.array[1];
        return upper;
    }
}
unittest
{
    __m128d first  = _mm_setr_pd(4.0, 6.0);
    __m128d second = _mm_setr_pd(7.0, 9.0);
    __m128d interleaved = _mm_unpackhi_pd(first, second);
    double[2] expected = [6.0, 9.0];
    assert(interleaved.array == expected);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @trusted
{
    // Result lanes are a0, b0, a1, b1, a2, b2, a3, b3.
    // PERF DMD SIMD
    static if (GDC_with_SSE2)
    {
        short8 lhs = cast(short8) a;
        short8 rhs = cast(short8) b;
        return cast(__m128i) __builtin_ia32_punpcklwd128(lhs, rhs);
    }
    else version(LDC)
    {
        // Shuffle indices 0..7 select from `a`, 8..15 from `b`.
        return cast(__m128i) shufflevectorLDC!(short8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(short8)a, cast(short8)b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // Portable path: all eight lanes are written before the value is returned.
        short8 loA = cast(short8)a;
        short8 loB = cast(short8)b;
        short8 mixed = void;
        foreach (k; 0 .. 4)
        {
            mixed.ptr[2 * k]     = loA.array[k];
            mixed.ptr[2 * k + 1] = loB.array[k];
        }
        return cast(__m128i)mixed;
    }
}
unittest
{
    __m128i first  = _mm_setr_epi16(0, 1,  2,  3,  4,  5,  6,  7);
    __m128i second = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 interleaved = cast(short8) _mm_unpacklo_epi16(first, second);
    short[8] expected = [0, 8, 1, 9, 2, 10, 3, 11];
    assert(interleaved.array == expected);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
{
    // Result lanes are a0, b0, a1, b1.
    // PERF DMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckldq128(a, b);
    }
    else version(LDC)
    {
        // Shuffle indices 0..3 select from `a`, 4..7 from `b`.
        return shufflevectorLDC!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b);
    }
    else
    {
        // Portable path; `mixed` is default-initialized, then every lane is overwritten.
        __m128i mixed;
        foreach (k; 0 .. 2)
        {
            mixed.ptr[2 * k]     = a.array[k];
            mixed.ptr[2 * k + 1] = b.array[k];
        }
        return mixed;
    }
}
unittest
{
    __m128i first  = _mm_setr_epi32(1, 2, 3, 4);
    __m128i second = _mm_setr_epi32(5, 6, 7, 8);
    __m128i interleaved = _mm_unpacklo_epi32(first, second);
    int[4] expected = [1, 5, 2, 6];
    assert(interleaved.array == expected);
}

/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
{
    // Result lanes (as 64-bit) are a0, b0.
    static if (GDC_with_SSE2)
    {
        long2 lhs = cast(long2) a;
        long2 rhs = cast(long2) b;
        return cast(__m128i) __builtin_ia32_punpcklqdq128(lhs, rhs);
    }
    else
    {
        long2 wideA = cast(long2)a;
        long2 wideB = cast(long2)b;
        long2 lower; // PERF =void;
        lower.ptr[0] = wideA.array[0];
        lower.ptr[1] = wideB.array[0];
        return cast(__m128i)lower;
    }
}
unittest // Issue #36
{
    __m128i first  = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i second = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 interleaved = cast(long2)(_mm_unpacklo_epi64(first, second));
    long[2] expected = [0x22222222_22222222, 0x44444444_44444444];
    assert(interleaved.array == expected);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @trusted
{
    // Result lanes are a0, b0, a1, b1, ..., a7, b7.
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        ubyte16 lhs = cast(ubyte16) a;
        ubyte16 rhs = cast(ubyte16) b;
        return cast(__m128i) __builtin_ia32_punpcklbw128(lhs, rhs);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else version(LDC)
    {
        // Shuffle indices 0..15 select from `a`, 16..31 from `b`.
        return cast(__m128i) shufflevectorLDC!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                                       4, 20, 5, 21, 6, 22, 7, 23)
                                              (cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // Portable path: all sixteen lanes are written before the value is returned.
        byte16 loA = cast(byte16)a;
        byte16 loB = cast(byte16)b;
        byte16 mixed = void;
        foreach (k; 0 .. 8)
        {
            mixed.ptr[2 * k]     = loA.array[k];
            mixed.ptr[2 * k + 1] = loB.array[k];
        }
        return cast(__m128i)mixed;
    }
}
unittest
{
    __m128i first  = _mm_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15);
    __m128i second = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 interleaved = cast(byte16) _mm_unpacklo_epi8(first, second);
    byte[16] expected = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
    assert(interleaved.array == expected);
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @trusted
{
    // Result lanes are a0, b0.
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpcklpd(a, b);
    }
    else version(LDC)
    {
        // Shuffle indices 0..1 select from `a`, 2..3 from `b`.
        return shufflevectorLDC!(__m128d, 0, 2)(a, b);
    }
    else
    {
        // Portable path: both lanes are written before the value is returned.
        double2 lower = void;
        lower.ptr[0] = a.array[0];
        lower.ptr[1] = b.array[0];
        return lower;
    }
}
unittest
{
    __m128d first  = _mm_setr_pd(4.0, 6.0);
    __m128d second = _mm_setr_pd(7.0, 9.0);
    __m128d interleaved = _mm_unpacklo_pd(first, second);
    double[2] expected = [4.0, 7.0];
    assert(interleaved.array == expected);
}

/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    // XOR is applied to the integer reinterpretation of both operands,
    // then the bits are viewed as doubles again.
    __m128i bitsA = cast(__m128i)a;
    __m128i bitsB = cast(__m128i)b;
    return cast(__m128d)(bitsA ^ bitsB);
}
unittest
{
    // Operands differ only in their sign bits, so only the sign bit survives the XOR.
    __m128d first  = _mm_setr_pd(-4.0, 6.0);
    __m128d second = _mm_setr_pd(4.0, -6.0);
    long2 bits = cast(long2) _mm_xor_pd(first, second);
    long[2] expected = [long.min, long.min];
    assert(bits.array == expected);
}

/// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i mixed = a ^ b;
    return mixed;
}
unittest
{
    __m128i first  = _mm_setr_epi64(975394, 619809709);
    __m128i second = _mm_setr_epi64(-920275025, -6);
    long2 bits = cast(long2) _mm_xor_si128(first, second);
    long[2] expected = [975394 ^ (-920275025L), 619809709L ^ -6];
    assert(bits.array == expected);
}

unittest
{
    // Smoke test chaining several intrinsics: square the per-lane differences,
    // fold them together using the two `_mm_srli_ps` shifts, then take the
    // square root of the lowest lane.
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 pa = _mm_loadu_ps(a.ptr);
        __m128 pb = _mm_loadu_ps(b.ptr);
        __m128 delta = _mm_sub_ps(pa, pb);
        __m128 squares = _mm_mul_ps(delta, delta);
        __m128 acc = _mm_add_ps(squares, _mm_srli_ps!8(squares));
        acc = _mm_add_ps(acc, _mm_srli_ps!4(acc));
        return _mm_cvtss_f32(_mm_sqrt_ss(acc));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}