1 /** 2 * SSE2 intrinsics. 3 * 4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019. 5 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 6 */ 7 module inteli.emmintrin; 8 9 public import inteli.types; 10 public import inteli.xmmintrin; // SSE2 includes SSE1 11 import inteli.mmx; 12 import inteli.internals; 13 14 nothrow @nogc: 15 16 17 // SSE2 instructions 18 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2 19 20 /// Add packed 16-bit integers in `a` and `b`. 21 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe 22 { 23 return cast(__m128i)(cast(short8)a + cast(short8)b); 24 } 25 unittest 26 { 27 __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77); 28 short8 R = cast(short8) _mm_add_epi16(A, A); 29 short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154]; 30 assert(R.array == correct); 31 } 32 33 /// Add packed 32-bit integers in `a` and `b`. 34 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe 35 { 36 return cast(__m128i)(cast(int4)a + cast(int4)b); 37 } 38 unittest 39 { 40 __m128i A = _mm_setr_epi32( -7, -1, 0, 9); 41 int4 R = _mm_add_epi32(A, A); 42 int[4] correct = [ -14, -2, 0, 18 ]; 43 assert(R.array == correct); 44 } 45 46 /// Add packed 64-bit integers in `a` and `b`. 47 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe 48 { 49 return cast(__m128i)(cast(long2)a + cast(long2)b); 50 } 51 unittest 52 { 53 __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000); 54 long2 R = cast(long2) _mm_add_epi64(A, A); 55 long[2] correct = [ -2, 0 ]; 56 assert(R.array == correct); 57 } 58 59 /// Add packed 8-bit integers in `a` and `b`. 60 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe 61 { 62 return cast(__m128i)(cast(byte16)a + cast(byte16)b); 63 } 64 unittest 65 { 66 __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78); 67 byte16 R = cast(byte16) _mm_add_epi8(A, A); 68 byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100]; 69 assert(R.array == correct); 70 } 71 72 /// Add the lower double-precision (64-bit) floating-point element 73 /// in `a` and `b`, store the result in the lower element of dst, 74 /// and copy the upper element from `a` to the upper element of destination. 75 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe 76 { 77 static if (GDC_with_SSE2) 78 { 79 return __builtin_ia32_addsd(a, b); 80 } 81 else version(DigitalMars) 82 { 83 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 84 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 85 asm pure nothrow @nogc @trusted { nop;} 86 a[0] = a[0] + b[0]; 87 return a; 88 } 89 else 90 { 91 a[0] += b[0]; 92 return a; 93 } 94 } 95 unittest 96 { 97 __m128d a = [1.5, -2.0]; 98 a = _mm_add_sd(a, a); 99 assert(a.array == [3.0, -2.0]); 100 } 101 102 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`. 103 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe 104 { 105 return a + b; 106 } 107 unittest 108 { 109 __m128d a = [1.5, -2.0]; 110 a = _mm_add_pd(a, a); 111 assert(a.array == [3.0, -4.0]); 112 } 113 114 /// Add 64-bit integers `a` and `b`. 115 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe 116 { 117 return a + b; 118 } 119 120 /// Add packed 16-bit integers in `a` and `b` using signed saturation. 
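/// Lanes that would overflow are clamped to the `short` range; an illustrative sketch:
/// ---
/// __m128i r = _mm_adds_epi16(_mm_set1_epi16(30000), _mm_set1_epi16(10000));
/// assert((cast(short8)r).array[0] == short.max); // 40000 saturates to 32767
/// ---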
121 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted 122 { 123 static if (GDC_with_SSE2) 124 { 125 return __builtin_ia32_paddsw128(a, b); 126 } 127 else version(LDC) 128 { 129 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 130 { 131 // x86: Generates PADDSW since LDC 1.15 -O0 132 // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20 133 enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 134 enum ir = ` 135 %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 136 ret <8 x i16> %r`; 137 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 138 } 139 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 140 { 141 short[8] res; 142 short8 sa = cast(short8)a; 143 short8 sb = cast(short8)b; 144 foreach(i; 0..8) 145 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 146 return _mm_loadu_si128(cast(int4*)res.ptr); 147 } 148 else 149 return __builtin_ia32_paddsw128(a, b); 150 } 151 else 152 { 153 short[8] res; 154 short8 sa = cast(short8)a; 155 short8 sb = cast(short8)b; 156 foreach(i; 0..8) 157 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 158 return _mm_loadu_si128(cast(int4*)res.ptr); 159 } 160 } 161 unittest 162 { 163 short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), 164 _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0)); 165 static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14]; 166 assert(res.array == correctResult); 167 } 168 169 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation. 170 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted 171 { 172 static if (GDC_with_SSE2) 173 { 174 return __builtin_ia32_paddsb128(a, b); 175 } 176 else version(LDC) 177 { 178 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 179 { 180 // x86: Generates PADDSB since LDC 1.15 -O0 181 // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20 182 enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 183 enum ir = ` 184 %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 185 ret <16 x i8> %r`; 186 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 187 } 188 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 189 { 190 byte[16] res; 191 byte16 sa = cast(byte16)a; 192 byte16 sb = cast(byte16)b; 193 foreach(i; 0..16) 194 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]); 195 return _mm_loadu_si128(cast(int4*)res.ptr); 196 } 197 else 198 return __builtin_ia32_paddsb128(a, b); 199 } 200 else 201 { 202 byte[16] res; 203 byte16 sa = cast(byte16)a; 204 byte16 sb = cast(byte16)b; 205 foreach(i; 0..16) 206 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]); 207 return _mm_loadu_si128(cast(int4*)res.ptr); 208 } 209 } 210 unittest 211 { 212 byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 213 _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 214 static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14, 215 16, 18, 20, 22, 24, 26, 28, 30]; 216 assert(res.array == correctResult); 217 } 218 219 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 220 // PERF: #GDC version? 
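// Illustrative sketch of the unsigned clamping: adding 200 and 100 in every lane
// saturates to 255 (0xFF) instead of wrapping around:
//   __m128i r = _mm_adds_epu8(_mm_set1_epi8(cast(byte)200), _mm_set1_epi8(cast(byte)100));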
221 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted 222 { 223 version(LDC) 224 { 225 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 226 { 227 // x86: Generates PADDUSB since LDC 1.15 -O0 228 // ARM: Generates uqadd.16b since LDC 1.21 -O1 229 enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 230 enum ir = ` 231 %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 232 ret <16 x i8> %r`; 233 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 234 } 235 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 236 { 237 ubyte[16] res; 238 byte16 sa = cast(byte16)a; 239 byte16 sb = cast(byte16)b; 240 foreach(i; 0..16) 241 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 242 return _mm_loadu_si128(cast(int4*)res.ptr); 243 } 244 else 245 return __builtin_ia32_paddusb128(a, b); 246 } 247 else 248 { 249 ubyte[16] res; 250 byte16 sa = cast(byte16)a; 251 byte16 sb = cast(byte16)b; 252 foreach(i; 0..16) 253 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 254 return _mm_loadu_si128(cast(int4*)res.ptr); 255 } 256 } 257 unittest 258 { 259 byte16 res = cast(byte16) 260 _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0), 261 _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0)); 262 static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 263 0, cast(byte)255, 4, 6, 8, 10, 12, 14]; 264 assert(res.array == correctResult); 265 } 266 267 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation. 268 // PERF: #GDC version? 269 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted 270 { 271 version(LDC) 272 { 273 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 274 { 275 // x86: Generates PADDUSW since LDC 1.15 -O0 276 // ARM: Generates uqadd.8h since LDC 1.21 -O1 277 enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 278 enum ir = ` 279 %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 280 ret <8 x i16> %r`; 281 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 282 } 283 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 284 { 285 ushort[8] res; 286 short8 sa = cast(short8)a; 287 short8 sb = cast(short8)b; 288 foreach(i; 0..8) 289 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 290 return _mm_loadu_si128(cast(int4*)res.ptr); 291 } 292 else 293 return __builtin_ia32_paddusw128(a, b); 294 } 295 else 296 { 297 ushort[8] res; 298 short8 sa = cast(short8)a; 299 short8 sb = cast(short8)b; 300 foreach(i; 0..8) 301 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 302 return _mm_loadu_si128(cast(int4*)res.ptr); 303 } 304 } 305 unittest 306 { 307 short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0), 308 _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0)); 309 static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6]; 310 assert(res.array == correctResult); 311 } 312 313 /// Compute the bitwise AND of packed double-precision (64-bit) 314 /// floating-point elements in `a` and `b`. 
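/// A typical use is applying a bit mask to both lanes, e.g. clearing the sign bits
/// to get absolute values; an illustrative sketch:
/// ---
/// long2 signClear = [0x7FFF_FFFF_FFFF_FFFF, 0x7FFF_FFFF_FFFF_FFFF];
/// __m128d r = _mm_and_pd(cast(__m128d) signClear, _mm_setr_pd(-2.0, 3.0)); // [2.0, 3.0]
/// ---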
315 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe 316 { 317 return cast(__m128d)( cast(long2)a & cast(long2)b ); 318 } 319 unittest 320 { 321 double a = 4.32; 322 double b = -78.99; 323 long correct = (*cast(long*)(&a)) & (*cast(long*)(&b)); 324 __m128d A = _mm_set_pd(a, b); 325 __m128d B = _mm_set_pd(b, a); 326 long2 R = cast(long2)( _mm_and_pd(A, B) ); 327 assert(R.array[0] == correct); 328 assert(R.array[1] == correct); 329 } 330 331 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`. 332 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe 333 { 334 return a & b; 335 } 336 unittest 337 { 338 __m128i A = _mm_set1_epi32(7); 339 __m128i B = _mm_set1_epi32(14); 340 __m128i R = _mm_and_si128(A, B); 341 int[4] correct = [6, 6, 6, 6]; 342 assert(R.array == correct); 343 } 344 345 /// Compute the bitwise NOT of packed double-precision (64-bit) 346 /// floating-point elements in `a` and then AND with `b`. 347 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe 348 { 349 return cast(__m128d)( ~(cast(long2)a) & cast(long2)b); 350 } 351 unittest 352 { 353 double a = 4.32; 354 double b = -78.99; 355 long correct = (~*cast(long*)(&a)) & ( *cast(long*)(&b)); 356 long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b)); 357 __m128d A = _mm_setr_pd(a, b); 358 __m128d B = _mm_setr_pd(b, a); 359 long2 R = cast(long2)( _mm_andnot_pd(A, B) ); 360 assert(R.array[0] == correct); 361 assert(R.array[1] == correct2); 362 } 363 364 /// Compute the bitwise NOT of 128 bits (representing integer data) 365 /// in `a` and then AND with `b`. 366 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe 367 { 368 return (~a) & b; 369 } 370 unittest 371 { 372 __m128i A = _mm_set1_epi32(7); 373 __m128i B = _mm_set1_epi32(14); 374 __m128i R = _mm_andnot_si128(A, B); 375 int[4] correct = [8, 8, 8, 8]; 376 assert(R.array == correct); 377 } 378 379 /// Average packed unsigned 16-bit integers in `a` and `b`. 380 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted 381 { 382 static if (GDC_with_SSE2) 383 { 384 return __builtin_ia32_pavgw128(a, b); 385 } 386 else static if (LDC_with_ARM64) 387 { 388 return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b); 389 } 390 else version(LDC) 391 { 392 // Generates pavgw even in LDC 1.0, even in -O0 393 // But not in ARM 394 enum ir = ` 395 %ia = zext <8 x i16> %0 to <8 x i32> 396 %ib = zext <8 x i16> %1 to <8 x i32> 397 %isum = add <8 x i32> %ia, %ib 398 %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 399 %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 400 %r = trunc <8 x i32> %isums to <8 x i16> 401 ret <8 x i16> %r`; 402 return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b); 403 } 404 else 405 { 406 short8 sa = cast(short8)a; 407 short8 sb = cast(short8)b; 408 short8 sr = void; 409 foreach(i; 0..8) 410 { 411 sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 ); 412 } 413 return cast(int4)sr; 414 } 415 } 416 unittest 417 { 418 __m128i A = _mm_set1_epi16(31); 419 __m128i B = _mm_set1_epi16(64); 420 short8 avg = cast(short8)(_mm_avg_epu16(A, B)); 421 foreach(i; 0..8) 422 assert(avg.array[i] == 48); 423 } 424 425 /// Average packed unsigned 8-bit integers in `a` and `b`. 
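/// The average rounds up, computing `(a + b + 1) >> 1` per unsigned lane; an illustrative sketch:
/// ---
/// __m128i r = _mm_avg_epu8(_mm_set1_epi8(3), _mm_set1_epi8(4)); // every lane is 4
/// ---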
426 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted 427 { 428 static if (GDC_with_SSE2) 429 { 430 return __builtin_ia32_pavgb128(a, b); 431 } 432 else static if (LDC_with_ARM64) 433 { 434 return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b); 435 } 436 else version(LDC) 437 { 438 // Generates pavgb even in LDC 1.0, even in -O0 439 // But not in ARM 440 enum ir = ` 441 %ia = zext <16 x i8> %0 to <16 x i16> 442 %ib = zext <16 x i8> %1 to <16 x i16> 443 %isum = add <16 x i16> %ia, %ib 444 %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 445 %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 446 %r = trunc <16 x i16> %isums to <16 x i8> 447 ret <16 x i8> %r`; 448 return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 449 } 450 else 451 { 452 byte16 sa = cast(byte16)a; 453 byte16 sb = cast(byte16)b; 454 byte16 sr = void; 455 foreach(i; 0..16) 456 { 457 sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 ); 458 } 459 return cast(int4)sr; 460 } 461 } 462 unittest 463 { 464 __m128i A = _mm_set1_epi8(31); 465 __m128i B = _mm_set1_epi8(64); 466 byte16 avg = cast(byte16)(_mm_avg_epu8(A, B)); 467 foreach(i; 0..16) 468 assert(avg.array[i] == 48); 469 } 470 471 /// Shift `a` left by `bytes` bytes while shifting in zeros. 472 alias _mm_bslli_si128 = _mm_slli_si128; 473 unittest 474 { 475 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 476 byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; 477 __m128i result = _mm_bslli_si128!5(toShift); 478 assert( (cast(byte16)result).array == exact); 479 } 480 481 /// Shift `v` right by `bytes` bytes while shifting in zeros. 482 alias _mm_bsrli_si128 = _mm_srli_si128; 483 unittest 484 { 485 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 486 byte[16] exact = [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0]; 487 __m128i result = _mm_bsrli_si128!5(toShift); 488 assert( (cast(byte16)result).array == exact); 489 } 490 491 /// Cast vector of type `__m128d` to type `__m128`. 492 /// Note: Also possible with a regular `cast(__m128)(a)`. 493 __m128 _mm_castpd_ps (__m128d a) pure @safe 494 { 495 return cast(__m128)a; 496 } 497 498 /// Cast vector of type `__m128d` to type `__m128i`. 499 /// Note: Also possible with a regular `cast(__m128i)(a)`. 500 __m128i _mm_castpd_si128 (__m128d a) pure @safe 501 { 502 return cast(__m128i)a; 503 } 504 505 /// Cast vector of type `__m128` to type `__m128d`. 506 /// Note: Also possible with a regular `cast(__m128d)(a)`. 507 __m128d _mm_castps_pd (__m128 a) pure @safe 508 { 509 return cast(__m128d)a; 510 } 511 512 /// Cast vector of type `__m128` to type `__m128i`. 513 /// Note: Also possible with a regular `cast(__m128i)(a)`. 514 __m128i _mm_castps_si128 (__m128 a) pure @safe 515 { 516 return cast(__m128i)a; 517 } 518 519 /// Cast vector of type `__m128i` to type `__m128d`. 520 /// Note: Also possible with a regular `cast(__m128d)(a)`. 521 __m128d _mm_castsi128_pd (__m128i a) pure @safe 522 { 523 return cast(__m128d)a; 524 } 525 526 /// Cast vector of type `__m128i` to type `__m128`. 527 /// Note: Also possible with a regular `cast(__m128)(a)`. 
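/// The cast reinterprets the 128 bits unchanged; it does not convert integer values to floats
/// (use `_mm_cvtepi32_ps` for that). Illustrative sketch:
/// ---
/// __m128 f = _mm_castsi128_ps(_mm_set1_epi32(0x3F800000)); // every lane is 1.0f
/// ---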
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}

/// Invalidate and flush the cache line that contains `p`
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else
    {
        // Do nothing. Invalidating cacheline does
        // not affect correctness.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqw128(a, b);
    }
    else
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0];
    short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4 A = [-3, -2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, -1];
    int4 R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqb128(a, b);
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal.
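/// Each result lane is a bit mask: all ones where the comparison holds, all zeroes otherwise.
/// Illustrative sketch:
/// ---
/// long2 m = cast(long2) _mm_cmpge_pd(_mm_setr_pd(2.0, 1.0), _mm_setr_pd(1.0, 3.0));
/// assert(m.array[0] == -1 && m.array[1] == 0);
/// ---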
667 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe 668 { 669 static if (GDC_with_SSE2) 670 { 671 return __builtin_ia32_cmpgepd(a, b); 672 } 673 else 674 { 675 return cast(__m128d) cmppd!(FPComparison.oge)(a, b); 676 } 677 } 678 679 /// Compare the lower double-precision (64-bit) floating-point elements 680 /// in `a` and `b` for greater-than-or-equal, store the result in the 681 /// lower element, and copy the upper element from `a`. 682 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe 683 { 684 // Note: There is no __builtin_ia32_cmpgesd builtin. 685 static if (GDC_with_SSE2) 686 { 687 return __builtin_ia32_cmpnltsd(b, a); 688 } 689 else 690 { 691 return cast(__m128d) cmpsd!(FPComparison.oge)(a, b); 692 } 693 } 694 695 /// Compare packed 16-bit integers in `a` and `b` for greater-than. 696 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe 697 { 698 static if (GDC_with_SSE2) 699 { 700 return __builtin_ia32_pcmpgtw128(a, b); 701 } 702 else 703 { 704 return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b)); 705 } 706 } 707 unittest 708 { 709 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 710 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 711 short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1]; 712 short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B)); 713 assert(R.array == E); 714 } 715 716 /// Compare packed 32-bit integers in `a` and `b` for greater-than. 717 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe 718 { 719 static if (GDC_with_SSE2) 720 { 721 return __builtin_ia32_pcmpgtd128(a, b); 722 } 723 else 724 { 725 return cast(__m128i)( greaterMask!int4(a, b)); 726 } 727 } 728 unittest 729 { 730 int4 A = [-3, 2, -1, 0]; 731 int4 B = [ 4, -2, 2, 0]; 732 int[4] E = [ 0, -1, 0, 0]; 733 int4 R = cast(int4)(_mm_cmpgt_epi32(A, B)); 734 assert(R.array == E); 735 } 736 737 /// Compare packed 8-bit integers in `a` and `b` for greater-than. 738 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe 739 { 740 static if (GDC_with_SSE2) 741 { 742 return __builtin_ia32_pcmpgtb128(a, b); 743 } 744 else 745 { 746 return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b)); 747 } 748 } 749 unittest 750 { 751 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 752 __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 753 byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B); 754 byte[16] correct = [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0]; 755 __m128i D = _mm_cmpeq_epi8(A, B); 756 assert(C.array == correct); 757 } 758 759 /// Compare packed double-precision (64-bit) floating-point elements 760 /// in `a` and `b` for greater-than. 761 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe 762 { 763 static if (GDC_with_SSE2) 764 { 765 return __builtin_ia32_cmpgtpd(a, b); 766 } 767 else 768 { 769 return cast(__m128d) cmppd!(FPComparison.ogt)(a, b); 770 } 771 } 772 773 /// Compare the lower double-precision (64-bit) floating-point elements 774 /// in `a` and `b` for greater-than, store the result in the lower element, 775 /// and copy the upper element from `a`. 776 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe 777 { 778 // Note: There is no __builtin_ia32_cmpgtsd builtin. 779 static if (GDC_with_SSE2) 780 { 781 return __builtin_ia32_cmpnlesd(b, a); 782 } 783 else 784 { 785 return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b); 786 } 787 } 788 789 /// Compare packed double-precision (64-bit) floating-point elements 790 /// in `a` and `b` for less-than-or-equal. 
791 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe 792 { 793 static if (GDC_with_SSE2) 794 { 795 return __builtin_ia32_cmplepd(a, b); 796 } 797 else 798 { 799 return cast(__m128d) cmppd!(FPComparison.ole)(a, b); 800 } 801 } 802 803 /// Compare the lower double-precision (64-bit) floating-point elements 804 /// in `a` and `b` for less-than-or-equal, store the result in the 805 /// lower element, and copy the upper element from `a`. 806 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe 807 { 808 static if (GDC_with_SSE2) 809 { 810 return __builtin_ia32_cmplesd(a, b); 811 } 812 else 813 { 814 return cast(__m128d) cmpsd!(FPComparison.ole)(a, b); 815 } 816 } 817 818 /// Compare packed 16-bit integers in `a` and `b` for less-than. 819 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe 820 { 821 return _mm_cmpgt_epi16(b, a); 822 } 823 824 /// Compare packed 32-bit integers in `a` and `b` for less-than. 825 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe 826 { 827 return _mm_cmpgt_epi32(b, a); 828 } 829 830 /// Compare packed 8-bit integers in `a` and `b` for less-than. 831 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe 832 { 833 return _mm_cmpgt_epi8(b, a); 834 } 835 836 /// Compare packed double-precision (64-bit) floating-point elements 837 /// in `a` and `b` for less-than. 838 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe 839 { 840 static if (GDC_with_SSE2) 841 { 842 return __builtin_ia32_cmpltpd(a, b); 843 } 844 else 845 { 846 return cast(__m128d) cmppd!(FPComparison.olt)(a, b); 847 } 848 } 849 850 /// Compare the lower double-precision (64-bit) floating-point elements 851 /// in `a` and `b` for less-than, store the result in the lower 852 /// element, and copy the upper element from `a`. 853 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe 854 { 855 static if (GDC_with_SSE2) 856 { 857 return __builtin_ia32_cmpltsd(a, b); 858 } 859 else 860 { 861 return cast(__m128d) cmpsd!(FPComparison.olt)(a, b); 862 } 863 } 864 865 /// Compare packed double-precision (64-bit) floating-point elements 866 /// in `a` and `b` for not-equal. 867 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe 868 { 869 static if (GDC_with_SSE2) 870 { 871 return __builtin_ia32_cmpneqpd(a, b); 872 } 873 else 874 { 875 return cast(__m128d) cmppd!(FPComparison.une)(a, b); 876 } 877 } 878 879 /// Compare the lower double-precision (64-bit) floating-point elements 880 /// in `a` and `b` for not-equal, store the result in the lower 881 /// element, and copy the upper element from `a`. 882 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe 883 { 884 static if (GDC_with_SSE2) 885 { 886 return __builtin_ia32_cmpneqsd(a, b); 887 } 888 else 889 { 890 return cast(__m128d) cmpsd!(FPComparison.une)(a, b); 891 } 892 } 893 894 /// Compare packed double-precision (64-bit) floating-point elements 895 /// in `a` and `b` for not-greater-than-or-equal. 896 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe 897 { 898 static if (GDC_with_SSE2) 899 { 900 return __builtin_ia32_cmpngepd(a, b); 901 } 902 else 903 { 904 return cast(__m128d) cmppd!(FPComparison.ult)(a, b); 905 } 906 } 907 908 /// Compare the lower double-precision (64-bit) floating-point elements 909 /// in `a` and `b` for not-greater-than-or-equal, store the result in 910 /// the lower element, and copy the upper element from `a`. 911 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe 912 { 913 // Note: There is no __builtin_ia32_cmpngesd builtin. 
914 static if (GDC_with_SSE2) 915 { 916 return __builtin_ia32_cmpltsd(b, a); 917 } 918 else 919 { 920 return cast(__m128d) cmpsd!(FPComparison.ult)(a, b); 921 } 922 } 923 924 /// Compare packed double-precision (64-bit) floating-point elements 925 /// in `a` and `b` for not-greater-than. 926 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe 927 { 928 static if (GDC_with_SSE2) 929 { 930 return __builtin_ia32_cmpngtpd(a, b); 931 } 932 else 933 { 934 return cast(__m128d) cmppd!(FPComparison.ule)(a, b); 935 } 936 } 937 938 /// Compare the lower double-precision (64-bit) floating-point elements 939 /// in `a` and `b` for not-greater-than, store the result in the 940 /// lower element, and copy the upper element from `a`. 941 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe 942 { 943 // Note: There is no __builtin_ia32_cmpngtsd builtin. 944 static if (GDC_with_SSE2) 945 { 946 return __builtin_ia32_cmplesd(b, a); 947 } 948 else 949 { 950 return cast(__m128d) cmpsd!(FPComparison.ule)(a, b); 951 } 952 } 953 954 /// Compare packed double-precision (64-bit) floating-point elements 955 /// in `a` and `b` for not-less-than-or-equal. 956 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe 957 { 958 static if (GDC_with_SSE2) 959 { 960 return __builtin_ia32_cmpnlepd(a, b); 961 } 962 else 963 { 964 return cast(__m128d) cmppd!(FPComparison.ugt)(a, b); 965 } 966 } 967 968 /// Compare the lower double-precision (64-bit) floating-point elements 969 /// in `a` and `b` for not-less-than-or-equal, store the result in the 970 /// lower element, and copy the upper element from `a`. 971 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe 972 { 973 static if (GDC_with_SSE2) 974 { 975 return __builtin_ia32_cmpnlesd(a, b); 976 } 977 else 978 { 979 return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b); 980 } 981 } 982 983 /// Compare packed double-precision (64-bit) floating-point elements 984 /// in `a` and `b` for not-less-than. 985 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe 986 { 987 static if (GDC_with_SSE2) 988 { 989 return __builtin_ia32_cmpnltpd(a, b); 990 } 991 else 992 { 993 return cast(__m128d) cmppd!(FPComparison.uge)(a, b); 994 } 995 } 996 997 /// Compare the lower double-precision (64-bit) floating-point elements 998 /// in `a` and `b` for not-less-than, store the result in the lower 999 /// element, and copy the upper element from `a`. 1000 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe 1001 { 1002 static if (GDC_with_SSE2) 1003 { 1004 return __builtin_ia32_cmpnltsd(a, b); 1005 } 1006 else 1007 { 1008 return cast(__m128d) cmpsd!(FPComparison.uge)(a, b); 1009 } 1010 } 1011 1012 /// Compare packed double-precision (64-bit) floating-point elements 1013 /// in `a` and `b` to see if neither is NaN. 1014 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe 1015 { 1016 static if (GDC_with_SSE2) 1017 { 1018 return __builtin_ia32_cmpordpd(a, b); 1019 } 1020 else 1021 { 1022 return cast(__m128d) cmppd!(FPComparison.ord)(a, b); 1023 } 1024 } 1025 1026 /// Compare the lower double-precision (64-bit) floating-point elements 1027 /// in `a` and `b` to see if neither is NaN, store the result in the 1028 /// lower element, and copy the upper element from `a` to the upper element. 
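/// Illustrative sketch: the lower lane becomes an all-ones mask only when both lower inputs are
/// ordered (neither is NaN), while the upper lane is passed through from `a`:
/// ---
/// __m128d r = _mm_cmpord_sd(_mm_setr_pd(1.0, 7.0), _mm_setr_pd(double.nan, 0.0));
/// assert((cast(long2)r).array[0] == 0 && r.array[1] == 7.0);
/// ---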
1029 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe 1030 { 1031 static if (GDC_with_SSE2) 1032 { 1033 return __builtin_ia32_cmpordsd(a, b); 1034 } 1035 else 1036 { 1037 return cast(__m128d) cmpsd!(FPComparison.ord)(a, b); 1038 } 1039 } 1040 1041 /// Compare packed double-precision (64-bit) floating-point elements 1042 /// in `a` and `b` to see if either is NaN. 1043 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe 1044 { 1045 static if (GDC_with_SSE2) 1046 { 1047 return __builtin_ia32_cmpunordpd(a, b); 1048 } 1049 else 1050 { 1051 return cast(__m128d) cmppd!(FPComparison.uno)(a, b); 1052 } 1053 } 1054 1055 /// Compare the lower double-precision (64-bit) floating-point elements 1056 /// in `a` and `b` to see if either is NaN, store the result in the lower 1057 /// element, and copy the upper element from `a` to the upper element. 1058 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe 1059 { 1060 static if (GDC_with_SSE2) 1061 { 1062 return __builtin_ia32_cmpunordsd(a, b); 1063 } 1064 else 1065 { 1066 return cast(__m128d) cmpsd!(FPComparison.uno)(a, b); 1067 } 1068 } 1069 1070 1071 // Note: we've reverted clang and GCC behaviour with regards to EFLAGS 1072 // Some such comparisons yields true for NaNs, other don't. 1073 1074 /// Compare the lower double-precision (64-bit) floating-point element 1075 /// in `a` and `b` for equality, and return the boolean result (0 or 1). 1076 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe 1077 { 1078 static if (GDC_with_SSE2) 1079 { 1080 return __builtin_ia32_comieq(a, b); 1081 } 1082 else 1083 { 1084 return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC 1085 } 1086 } 1087 1088 /// Compare the lower double-precision (64-bit) floating-point element 1089 /// in `a` and `b` for greater-than-or-equal, and return the boolean 1090 /// result (0 or 1). 1091 int _mm_comige_sd (__m128d a, __m128d b) pure @safe 1092 { 1093 static if (GDC_with_SSE2) 1094 { 1095 return __builtin_ia32_comige(a, b); 1096 } 1097 else 1098 { 1099 return comsd!(FPComparison.oge)(a, b); 1100 } 1101 } 1102 1103 /// Compare the lower double-precision (64-bit) floating-point element 1104 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1). 1105 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe 1106 { 1107 static if (GDC_with_SSE2) 1108 { 1109 return __builtin_ia32_comigt(a, b); 1110 } 1111 else 1112 { 1113 return comsd!(FPComparison.ogt)(a, b); 1114 } 1115 } 1116 1117 /// Compare the lower double-precision (64-bit) floating-point element 1118 /// in `a` and `b` for less-than-or-equal. 1119 int _mm_comile_sd (__m128d a, __m128d b) pure @safe 1120 { 1121 static if (GDC_with_SSE2) 1122 { 1123 return __builtin_ia32_comile(a, b); 1124 } 1125 else 1126 { 1127 return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC 1128 } 1129 } 1130 1131 /// Compare the lower double-precision (64-bit) floating-point element 1132 /// in `a` and `b` for less-than, and return the boolean result (0 or 1). 1133 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe 1134 { 1135 static if (GDC_with_SSE2) 1136 { 1137 return __builtin_ia32_comilt(a, b); 1138 } 1139 else 1140 { 1141 return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC 1142 } 1143 } 1144 1145 /// Compare the lower double-precision (64-bit) floating-point element 1146 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1). 
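/// Illustrative sketch:
/// ---
/// assert(_mm_comineq_sd(_mm_set1_pd(1.0), _mm_set1_pd(2.0)) == 1);
/// assert(_mm_comineq_sd(_mm_set1_pd(1.0), _mm_set1_pd(1.0)) == 0);
/// ---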
1147 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe 1148 { 1149 static if (GDC_with_SSE2) 1150 { 1151 return __builtin_ia32_comineq(a, b); 1152 } 1153 else 1154 { 1155 return comsd!(FPComparison.one)(a, b); 1156 } 1157 } 1158 1159 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) 1160 /// floating-point elements. 1161 __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted 1162 { 1163 version(LDC) 1164 { 1165 // Generates cvtdq2pd since LDC 1.0, even without optimizations 1166 enum ir = ` 1167 %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1> 1168 %r = sitofp <2 x i32> %v to <2 x double> 1169 ret <2 x double> %r`; 1170 return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a); 1171 } 1172 else static if (GDC_with_SSE2) 1173 { 1174 return __builtin_ia32_cvtdq2pd(a); 1175 } 1176 else 1177 { 1178 double2 r = void; 1179 r.ptr[0] = a.array[0]; 1180 r.ptr[1] = a.array[1]; 1181 return r; 1182 } 1183 } 1184 unittest 1185 { 1186 __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54)); 1187 assert(A.array[0] == 54.0); 1188 assert(A.array[1] == 54.0); 1189 } 1190 1191 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 1192 /// floating-point elements. 1193 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted 1194 { 1195 static if (GDC_with_SSE2) 1196 { 1197 return __builtin_ia32_cvtdq2ps(a); 1198 } 1199 else 1200 { 1201 // x86: Generates cvtdq2ps since LDC 1.0.0 -O1 1202 // ARM: Generats scvtf.4s since LDC 1.8.0 -02 1203 __m128 res; 1204 res.ptr[0] = cast(float)a.array[0]; 1205 res.ptr[1] = cast(float)a.array[1]; 1206 res.ptr[2] = cast(float)a.array[2]; 1207 res.ptr[3] = cast(float)a.array[3]; 1208 return res; 1209 } 1210 } 1211 unittest 1212 { 1213 __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000)); 1214 assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]); 1215 } 1216 1217 /// Convert packed double-precision (64-bit) floating-point elements 1218 /// in `a` to packed 32-bit integers. 1219 // PERF #ARM 1220 __m128i _mm_cvtpd_epi32 (__m128d a) @trusted 1221 { 1222 static if (LDC_with_SSE2) 1223 { 1224 // Like in clang, implemented with a magic intrinsic right now 1225 return __builtin_ia32_cvtpd2dq(a); 1226 } 1227 else static if (GDC_with_SSE2) 1228 { 1229 return __builtin_ia32_cvtpd2dq(a); 1230 } 1231 else 1232 { 1233 __m128i r = _mm_setzero_si128(); 1234 r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]); 1235 r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]); 1236 return r; 1237 } 1238 } 1239 unittest 1240 { 1241 int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0)); 1242 assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0); 1243 } 1244 1245 /// Convert packed double-precision (64-bit) floating-point elements in `v` 1246 /// to packed 32-bit integers 1247 __m64 _mm_cvtpd_pi32 (__m128d v) @safe 1248 { 1249 return to_m64(_mm_cvtpd_epi32(v)); 1250 } 1251 unittest 1252 { 1253 int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0)); 1254 assert(A.array[0] == 55 && A.array[1] == 61); 1255 } 1256 1257 /// Convert packed double-precision (64-bit) floating-point elements 1258 /// in `a` to packed single-precision (32-bit) floating-point elements. 
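/// The two converted values occupy the lower two lanes of the result and the upper two lanes are
/// zeroed, as in the unit test below. Illustrative sketch:
/// ---
/// __m128 r = _mm_cvtpd_ps(_mm_setr_pd(1.5, -2.0)); // [1.5f, -2.0f, 0.0f, 0.0f]
/// ---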
1259 __m128 _mm_cvtpd_ps (__m128d a) pure @trusted 1260 { 1261 static if (LDC_with_SSE2) 1262 { 1263 return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately 1264 } 1265 else static if (GDC_with_SSE2) 1266 { 1267 return __builtin_ia32_cvtpd2ps(a); 1268 } 1269 else 1270 { 1271 __m128 r = void; 1272 r.ptr[0] = a.array[0]; 1273 r.ptr[1] = a.array[1]; 1274 r.ptr[2] = 0; 1275 r.ptr[3] = 0; 1276 return r; 1277 } 1278 } 1279 unittest 1280 { 1281 __m128d A = _mm_set_pd(5.25, 4.0); 1282 __m128 B = _mm_cvtpd_ps(A); 1283 assert(B.array == [4.0f, 5.25f, 0, 0]); 1284 } 1285 1286 /// Convert packed 32-bit integers in `v` to packed double-precision 1287 /// (64-bit) floating-point elements. 1288 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe 1289 { 1290 return _mm_cvtepi32_pd(to_m128i(v)); 1291 } 1292 unittest 1293 { 1294 __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5)); 1295 assert(A.array[0] == 4.0 && A.array[1] == -5.0); 1296 } 1297 1298 /// Convert packed single-precision (32-bit) floating-point elements 1299 /// in `a` to packed 32-bit integers 1300 __m128i _mm_cvtps_epi32 (__m128 a) @trusted 1301 { 1302 static if (LDC_with_SSE2) 1303 { 1304 // Disabled, since it fail with optimizations unfortunately 1305 //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq; 1306 return __asm!__m128i("cvtps2dq $1,$0","=x,x",a); 1307 } 1308 else static if (GDC_with_SSE2) 1309 { 1310 return __builtin_ia32_cvtps2dq(a); 1311 } 1312 else static if (LDC_with_ARM64) 1313 { 1314 // Get current rounding mode. 1315 uint fpscr = arm_get_fpcr(); 1316 switch(fpscr & _MM_ROUND_MASK_ARM) 1317 { 1318 default: 1319 case _MM_ROUND_NEAREST_ARM: return vcvtnq_s32_f32(a); 1320 case _MM_ROUND_DOWN_ARM: return vcvtmq_s32_f32(a); 1321 case _MM_ROUND_UP_ARM: return vcvtpq_s32_f32(a); 1322 case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a); 1323 } 1324 } 1325 else 1326 { 1327 __m128i r = void; 1328 r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]); 1329 r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]); 1330 r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]); 1331 r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]); 1332 return r; 1333 } 1334 } 1335 unittest 1336 { 1337 uint savedRounding = _MM_GET_ROUNDING_MODE(); 1338 1339 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 1340 __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 1341 assert(A.array == [1, -2, 54, -3]); 1342 1343 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 1344 A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 1345 assert(A.array == [1, -3, 53, -3]); 1346 1347 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 1348 A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 1349 assert(A.array == [2, -2, 54, -2]); 1350 1351 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 1352 A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 1353 assert(A.array == [1, -2, 53, -2]); 1354 1355 _MM_SET_ROUNDING_MODE(savedRounding); 1356 } 1357 1358 /// Convert packed single-precision (32-bit) floating-point elements 1359 /// in `a` to packed double-precision (64-bit) floating-point elements. 
1360 __m128d _mm_cvtps_pd (__m128 a) pure @trusted 1361 { 1362 version(LDC) 1363 { 1364 // Generates cvtps2pd since LDC 1.0 -O0 1365 enum ir = ` 1366 %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1> 1367 %r = fpext <2 x float> %v to <2 x double> 1368 ret <2 x double> %r`; 1369 return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a); 1370 } 1371 else static if (GDC_with_SSE2) 1372 { 1373 return __builtin_ia32_cvtps2pd(a); 1374 } 1375 else 1376 { 1377 double2 r = void; 1378 r.ptr[0] = a.array[0]; 1379 r.ptr[1] = a.array[1]; 1380 return r; 1381 } 1382 } 1383 unittest 1384 { 1385 __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f)); 1386 assert(A.array[0] == 54.0); 1387 assert(A.array[1] == 54.0); 1388 } 1389 1390 /// Copy the lower double-precision (64-bit) floating-point element of `a`. 1391 double _mm_cvtsd_f64 (__m128d a) pure @safe 1392 { 1393 return a.array[0]; 1394 } 1395 1396 /// Convert the lower double-precision (64-bit) floating-point element 1397 /// in `a` to a 32-bit integer. 1398 int _mm_cvtsd_si32 (__m128d a) @safe 1399 { 1400 static if (LDC_with_SSE2) 1401 { 1402 return __builtin_ia32_cvtsd2si(a); 1403 } 1404 else static if (GDC_with_SSE2) 1405 { 1406 return __builtin_ia32_cvtsd2si(a); 1407 } 1408 else 1409 { 1410 return convertDoubleToInt32UsingMXCSR(a[0]); 1411 } 1412 } 1413 unittest 1414 { 1415 assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0))); 1416 } 1417 1418 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer. 1419 long _mm_cvtsd_si64 (__m128d a) @trusted 1420 { 1421 version(LDC) 1422 { 1423 // Unfortunately this builtin crashes in 32-bit 1424 version(X86_64) 1425 return __builtin_ia32_cvtsd2si64(a); 1426 else 1427 { 1428 return convertDoubleToInt64UsingMXCSR(a[0]); 1429 } 1430 } 1431 else 1432 { 1433 return convertDoubleToInt64UsingMXCSR(a.array[0]); 1434 } 1435 } 1436 unittest 1437 { 1438 assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0))); 1439 1440 uint savedRounding = _MM_GET_ROUNDING_MODE(); 1441 1442 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 1443 assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49))); 1444 1445 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 1446 assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1))); 1447 1448 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 1449 assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1))); 1450 1451 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 1452 assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9))); 1453 1454 _MM_SET_ROUNDING_MODE(savedRounding); 1455 } 1456 1457 deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; /// 1458 1459 /// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) 1460 /// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a` 1461 /// to the upper elements of result. 1462 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted 1463 { 1464 static if (GDC_with_SSE2) 1465 { 1466 return __builtin_ia32_cvtsd2ss(a, b); 1467 } 1468 else 1469 { 1470 // Generates cvtsd2ss since LDC 1.3 -O0 1471 a.ptr[0] = b.array[0]; 1472 return a; 1473 } 1474 } 1475 unittest 1476 { 1477 __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0)); 1478 assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]); 1479 } 1480 1481 /// Get the lower 32-bit integer in `a`. 1482 int _mm_cvtsi128_si32 (__m128i a) pure @safe 1483 { 1484 return a.array[0]; 1485 } 1486 1487 /// Get the lower 64-bit integer in `a`. 
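/// Illustrative round-trip sketch with `_mm_cvtsi64_si128`:
/// ---
/// assert(_mm_cvtsi128_si64(_mm_cvtsi64_si128(-42)) == -42);
/// ---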
1488 long _mm_cvtsi128_si64 (__m128i a) pure @safe 1489 { 1490 long2 la = cast(long2)a; 1491 return la.array[0]; 1492 } 1493 deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64; 1494 1495 /// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the 1496 /// lower element of result, and copy the upper element from `a` to the upper element of result. 1497 __m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted 1498 { 1499 a.ptr[0] = cast(double)b; 1500 return a; 1501 } 1502 unittest 1503 { 1504 __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42); 1505 assert(a.array == [42.0, 0]); 1506 } 1507 1508 /// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements. 1509 __m128i _mm_cvtsi32_si128 (int a) pure @trusted 1510 { 1511 int4 r = [0, 0, 0, 0]; 1512 r.ptr[0] = a; 1513 return r; 1514 } 1515 unittest 1516 { 1517 __m128i a = _mm_cvtsi32_si128(65); 1518 assert(a.array == [65, 0, 0, 0]); 1519 } 1520 1521 /// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in 1522 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 1523 1524 __m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted 1525 { 1526 a.ptr[0] = cast(double)b; 1527 return a; 1528 } 1529 unittest 1530 { 1531 __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42); 1532 assert(a.array == [42.0, 0]); 1533 } 1534 1535 /// Copy 64-bit integer `a` to the lower element of result, and zero the upper element. 1536 __m128i _mm_cvtsi64_si128 (long a) pure @trusted 1537 { 1538 long2 r = [0, 0]; 1539 r.ptr[0] = a; 1540 return cast(__m128i)(r); 1541 } 1542 1543 deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; /// 1544 deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; /// 1545 1546 /// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 1547 /// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 1548 // element of result. 1549 double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted 1550 { 1551 a.ptr[0] = b.array[0]; 1552 return a; 1553 } 1554 unittest 1555 { 1556 __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f)); 1557 assert(a.array == [42.0, 0]); 1558 } 1559 1560 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation. 1561 long _mm_cvttss_si64 (__m128 a) pure @safe 1562 { 1563 return cast(long)(a.array[0]); // Generates cvttss2si as expected 1564 } 1565 unittest 1566 { 1567 assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); 1568 } 1569 1570 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation. 1571 /// Put zeroes in the upper elements of result. 
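/// Truncation rounds toward zero regardless of the current MXCSR rounding mode, unlike
/// `_mm_cvtpd_epi32`. Illustrative sketch:
/// ---
/// __m128i r = _mm_cvttpd_epi32(_mm_setr_pd(-1.9, 2.9)); // [-1, 2, 0, 0]
/// ---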
1572 __m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted 1573 { 1574 static if (LDC_with_SSE2) 1575 { 1576 return __builtin_ia32_cvttpd2dq(a); 1577 } 1578 else static if (GDC_with_SSE2) 1579 { 1580 return __builtin_ia32_cvttpd2dq(a); 1581 } 1582 else 1583 { 1584 // Note: doesn't generate cvttpd2dq as of LDC 1.13 1585 __m128i r; 1586 r.ptr[0] = cast(int)a.array[0]; 1587 r.ptr[1] = cast(int)a.array[1]; 1588 r.ptr[2] = 0; 1589 r.ptr[3] = 0; 1590 return r; 1591 } 1592 } 1593 unittest 1594 { 1595 __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f)); 1596 assert(R.array == [-4, 45641, 0, 0]); 1597 } 1598 1599 /// Convert packed double-precision (64-bit) floating-point elements in `v` 1600 /// to packed 32-bit integers with truncation. 1601 __m64 _mm_cvttpd_pi32 (__m128d v) pure @safe 1602 { 1603 return to_m64(_mm_cvttpd_epi32(v)); 1604 } 1605 unittest 1606 { 1607 int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f)); 1608 int[2] correct = [-4, 45641]; 1609 assert(R.array == correct); 1610 } 1611 1612 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation. 1613 __m128i _mm_cvttps_epi32 (__m128 a) pure @trusted 1614 { 1615 // x86: Generates cvttps2dq since LDC 1.3 -O2 1616 // ARM64: generates fcvtze since LDC 1.8 -O2 1617 __m128i r; 1618 r.ptr[0] = cast(int)a.array[0]; 1619 r.ptr[1] = cast(int)a.array[1]; 1620 r.ptr[2] = cast(int)a.array[2]; 1621 r.ptr[3] = cast(int)a.array[3]; 1622 return r; 1623 } 1624 unittest 1625 { 1626 __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f)); 1627 assert(R.array == [-4, 45641, 0, 1]); 1628 } 1629 1630 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation. 1631 int _mm_cvttsd_si32 (__m128d a) 1632 { 1633 // Generates cvttsd2si since LDC 1.3 -O0 1634 return cast(int)a.array[0]; 1635 } 1636 1637 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation. 1638 long _mm_cvttsd_si64 (__m128d a) 1639 { 1640 // Generates cvttsd2si since LDC 1.3 -O0 1641 // but in 32-bit instead, it's a long sequence that resort to FPU 1642 return cast(long)a.array[0]; 1643 } 1644 1645 deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; /// 1646 1647 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`. 
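/// Illustrative sketch (per-lane division):
/// ---
/// __m128d r = _mm_div_pd(_mm_setr_pd(9.0, 1.0), _mm_setr_pd(3.0, 4.0)); // [3.0, 0.25]
/// ---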
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}

/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`,
/// store the result in the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_divsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        a.ptr[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
// PERF: ARM version has array bound check
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

/// Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction.
/// Guarantees that every load instruction that precedes, in program order, the load fence instruction is globally
/// visible before any load instruction which follows the fence in program order.
void _mm_lfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_lfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
    else version(LDC)
    {
        llvm_memory_fence(); // PERF actually generates mfence
    }
    else
        static assert(false);
}
unittest
{
    _mm_lfence();
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}
unittest
{
    align(16) double[2] S = [-5.0, 7.0];
    __m128d R = _mm_load_pd(S.ptr);
    assert(R.array == S);
}

/// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] arr = [*mem_addr, *mem_addr];
    return loadUnaligned!(double2)(&arr[0]);
}
unittest
{
    double what = 4;
    __m128d R = _mm_load_pd1(&what);
    double[2] correct = [4.0, 4];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper
/// element.
`mem_addr` does not need to be aligned on any particular boundary. 1783 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted 1784 { 1785 double2 r = [0, 0]; 1786 r.ptr[0] = *mem_addr; 1787 return r; 1788 } 1789 unittest 1790 { 1791 double x = -42; 1792 __m128d a = _mm_load_sd(&x); 1793 assert(a.array == [-42.0, 0.0]); 1794 } 1795 1796 /// Load 128-bits of integer data from memory into dst. 1797 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1798 __m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shoudln't be trusted because alignment, Issue #62 1799 { 1800 return *mem_addr; 1801 } 1802 unittest 1803 { 1804 align(16) int[4] correct = [-1, 2, 3, 4]; 1805 int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr); 1806 assert(A.array == correct); 1807 } 1808 1809 alias _mm_load1_pd = _mm_load_pd1; /// 1810 1811 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 1812 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary. 1813 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted 1814 { 1815 a.ptr[1] = *mem_addr; 1816 return a; 1817 } 1818 unittest 1819 { 1820 double A = 7.0; 1821 __m128d B = _mm_setr_pd(4.0, -5.0); 1822 __m128d R = _mm_loadh_pd(B, &A); 1823 double[2] correct = [ 4.0, 7.0 ]; 1824 assert(R.array == correct); 1825 } 1826 1827 /// Load 64-bit integer from memory into the first element of result. Zero out the other. 1828 // Note: strange signature since the memory doesn't have to aligned (Issue #60) 1829 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature 1830 { 1831 auto pLong = cast(const(long)*)mem_addr; 1832 long2 r = [0, 0]; 1833 r.ptr[0] = *pLong; 1834 return cast(__m128i)(r); 1835 } 1836 unittest 1837 { 1838 long A = 0x7878787870707070; 1839 long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A); 1840 long[2] correct = [0x7878787870707070, 0]; 1841 assert(R.array == correct); 1842 } 1843 1844 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 1845 /// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary. 1846 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted 1847 { 1848 a.ptr[0] = *mem_addr; 1849 return a; 1850 } 1851 unittest 1852 { 1853 double A = 7.0; 1854 __m128d B = _mm_setr_pd(4.0, -5.0); 1855 __m128d R = _mm_loadl_pd(B, &A); 1856 double[2] correct = [ 7.0, -5.0 ]; 1857 assert(R.array == correct); 1858 } 1859 1860 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 1861 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1862 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted // TODO: shouldn't be trusted 1863 { 1864 __m128d a = *cast(__m128d*)(mem_addr); 1865 __m128d r; 1866 r.ptr[0] = a.array[1]; 1867 r.ptr[1] = a.array[0]; 1868 return r; 1869 } 1870 unittest 1871 { 1872 align(16) double[2] A = [56.0, -74.0]; 1873 __m128d R = _mm_loadr_pd(A.ptr); 1874 double[2] correct = [-74.0, 56.0]; 1875 assert(R.array == correct); 1876 } 1877 1878 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 1879 /// `mem_addr` does not need to be aligned on any particular boundary. 
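/// Unlike `_mm_load_pd`, no 16-byte alignment is required; an illustrative sketch:
/// ---
/// double[3] buf = [1.0, 2.0, 3.0];
/// __m128d r = _mm_loadu_pd(&buf[1]); // [2.0, 3.0], regardless of alignment
/// ---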
1880 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe 1881 { 1882 static if (GDC_with_SSE2) 1883 { 1884 return __builtin_ia32_loadupd(mem_addr); 1885 } 1886 else 1887 { 1888 return loadUnaligned!(double2)(mem_addr); 1889 } 1890 } 1891 unittest 1892 { 1893 double[2] A = [56.0, -75.0]; 1894 __m128d R = _mm_loadu_pd(A.ptr); 1895 double[2] correct = [56.0, -75.0]; 1896 assert(R.array == correct); 1897 } 1898 1899 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary. 1900 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted 1901 { 1902 static if (GDC_with_SSE2) 1903 { 1904 return __builtin_ia32_loaddqu(cast(const(char*))mem_addr); 1905 } 1906 else 1907 { 1908 return loadUnaligned!(__m128i)(cast(int*)mem_addr); 1909 } 1910 } 1911 unittest 1912 { 1913 align(16) int[4] correct = [-1, 2, -3, 4]; 1914 int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr); 1915 assert(A.array == correct); 1916 } 1917 1918 /// Load unaligned 32-bit integer from memory into the first element of result. 1919 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted 1920 { 1921 int r = *cast(int*)(mem_addr); 1922 int4 result = [0, 0, 0, 0]; 1923 result.ptr[0] = r; 1924 return result; 1925 } 1926 unittest 1927 { 1928 int r = 42; 1929 __m128i A = _mm_loadu_si32(&r); 1930 int[4] correct = [42, 0, 0, 0]; 1931 assert(A.array == correct); 1932 } 1933 1934 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 1935 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 1936 /// and pack the results in destination. 1937 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted 1938 { 1939 static if (GDC_with_SSE2) 1940 { 1941 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 1942 } 1943 else static if (LDC_with_SSE2) 1944 { 1945 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 1946 } 1947 else static if (LDC_with_ARM64) 1948 { 1949 int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b)); 1950 int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b)); 1951 int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); 1952 int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); 1953 return vcombine_s32(rl, rh); 1954 } 1955 else 1956 { 1957 short8 sa = cast(short8)a; 1958 short8 sb = cast(short8)b; 1959 int4 r; 1960 foreach(i; 0..4) 1961 { 1962 r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1]; 1963 } 1964 return r; 1965 } 1966 } 1967 unittest 1968 { 1969 short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 1970 short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 1971 int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B); 1972 int[4] correct = [1, 13, -2147483648, 2*32767*32767]; 1973 assert(R.array == correct); 1974 } 1975 1976 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 1977 /// (elements are not stored when the highest bit is not set in the corresponding element) 1978 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 1979 /// boundary. 
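/// Only bytes whose mask lane has its most significant bit set are written; other destination
/// bytes are left untouched, as in the unit test below. Illustrative sketch:
/// ---
/// ubyte[16] dst = 42;
/// __m128i mask = _mm_setr_epi8(-1, 0, -1, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0);
/// _mm_maskmoveu_si128(_mm_set1_epi8(7), mask, dst.ptr);
/// assert(dst[0] == 7 && dst[1] == 42 && dst[2] == 7);
/// ---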
1980 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted 1981 { 1982 static if (GDC_with_SSE2) 1983 { 1984 return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr); 1985 } 1986 else static if (LDC_with_SSE2) 1987 { 1988 return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr); 1989 } 1990 else 1991 { 1992 // PERF: catastrophic on ARM 1993 byte16 b = cast(byte16)a; 1994 byte16 m = cast(byte16)mask; 1995 byte* dest = cast(byte*)(mem_addr); 1996 foreach(j; 0..16) 1997 { 1998 if (m.array[j] & 128) 1999 { 2000 dest[j] = b.array[j]; 2001 } 2002 } 2003 } 2004 } 2005 unittest 2006 { 2007 ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]; 2008 __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0); 2009 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15); 2010 _mm_maskmoveu_si128(A, mask, dest.ptr); 2011 ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42]; 2012 assert(dest == correct); 2013 } 2014 2015 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values. 2016 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe 2017 { 2018 version(GNU) 2019 { 2020 // PERF: not necessarily the best for GDC 2021 __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else 2022 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2023 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2024 return _mm_xor_si128(b, mask); 2025 } 2026 else 2027 { 2028 // x86: pmaxsw since LDC 1.0 -O1 2029 // ARM: smax.8h since LDC 1.5 -01 2030 short8 sa = cast(short8)a; 2031 short8 sb = cast(short8)b; 2032 short8 greater = greaterMask!short8(sa, sb); 2033 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2034 } 2035 } 2036 unittest 2037 { 2038 short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57), 2039 _mm_setr_epi16(-4,-8, 9, 7, 0,-32768, 0, 0)); 2040 short[8] correct = [32767, 1, 9, 7, 9, 7, 0, 0]; 2041 assert(R.array == correct); 2042 } 2043 2044 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values. 2045 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe 2046 { 2047 version(LDC) 2048 { 2049 // x86: pmaxub since LDC 1.0.0 -O1 2050 // ARM64: umax.16b since LDC 1.5.0 -O1 2051 // PERF: catastrophic on ARM32 2052 alias ubyte16 = Vector!(ubyte[16]); 2053 ubyte16 sa = cast(ubyte16)a; 2054 ubyte16 sb = cast(ubyte16)b; 2055 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2056 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2057 } 2058 else 2059 { 2060 __m128i value128 = _mm_set1_epi8(-128); 2061 __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2062 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2063 __m128i mask = _mm_and_si128(aTob, higher); 2064 return _mm_xor_si128(b, mask); 2065 } 2066 } 2067 unittest 2068 { 2069 byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2070 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2071 byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 2072 assert(R.array == correct); 2073 } 2074 2075 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values. 
__m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxpd(a, b);
    }
    else
    {
        // x86: Generates maxpd starting with LDC 1.9 -O2
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 8.0);
}

/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxsd(a, b);
    }
    else
    {
        __m128d r = a;
        // Generates maxsd starting with LDC 1.3
        r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 1.0);
}

/// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to
/// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction
/// is globally visible before any memory instruction which follows the fence in program order.
void _mm_mfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_mfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
    else version(LDC)
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_mfence();
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
2173 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe 2174 { 2175 version(GNU) 2176 { 2177 // PERF: not necessarily the best for GDC 2178 __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else 2179 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2180 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2181 return _mm_xor_si128(b, mask); 2182 } 2183 else 2184 { 2185 // x86: pminsw since LDC 1.0 -O1 2186 // ARM: smin.8h since LDC 1.5 -01 2187 short8 sa = cast(short8)a; 2188 short8 sb = cast(short8)b; 2189 short8 greater = greaterMask!short8(sa, sb); 2190 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2191 } 2192 } 2193 unittest 2194 { 2195 short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-32768), 2196 _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0)); 2197 short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -32768]; 2198 assert(R.array == correct); 2199 } 2200 2201 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values. 2202 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe 2203 { 2204 version(LDC) 2205 { 2206 // x86: pminub since LDC 1.0.0 -O1 2207 // ARM: umin.16b since LDC 1.5.0 -O1 2208 // PERF: catastrophic on ARM32 2209 alias ubyte16 = Vector!(ubyte[16]); 2210 ubyte16 sa = cast(ubyte16)a; 2211 ubyte16 sb = cast(ubyte16)b; 2212 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2213 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2214 } 2215 else 2216 { 2217 __m128i value128 = _mm_set1_epi8(-128); 2218 __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2219 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2220 __m128i mask = _mm_and_si128(aTob, lower); 2221 return _mm_xor_si128(b, mask); 2222 } 2223 } 2224 unittest 2225 { 2226 byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2227 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2228 byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; 2229 assert(R.array == correct); 2230 } 2231 2232 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values. 2233 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted 2234 { 2235 static if (GDC_with_SSE2) 2236 { 2237 return __builtin_ia32_minpd(a, b); 2238 } 2239 else 2240 { 2241 // Generates minpd starting with LDC 1.9 2242 a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2243 a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1]; 2244 return a; 2245 } 2246 } 2247 unittest 2248 { 2249 __m128d A = _mm_setr_pd(1.0, 2.0); 2250 __m128d B = _mm_setr_pd(4.0, 1.0); 2251 __m128d M = _mm_min_pd(A, B); 2252 assert(M.array[0] == 1.0); 2253 assert(M.array[1] == 1.0); 2254 } 2255 2256 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 2257 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 2258 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe 2259 { 2260 static if (GDC_with_SSE2) 2261 { 2262 return __builtin_ia32_minsd(a, b); 2263 } 2264 else 2265 { 2266 // Generates minsd starting with LDC 1.3 2267 __m128d r = a; 2268 r.array[0] = (a.array[0] < b.array[0]) ? 
            a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_min_sd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 3.0);
}

/// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
__m128i _mm_move_epi64 (__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movq128(a);
    }
    else
    {
        long2 result = [ 0, 0 ];
        long2 la = cast(long2) a;
        result.ptr[0] = la.array[0];
        return cast(__m128i)(result);
    }
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] correct = [13, 0];
    assert(B.array == correct);
}

/// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy
/// the upper element from `a` to the upper element of result.
__m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movsd(a, b);
    }
    else
    {
        b.ptr[1] = a.array[1];
        return b;
    }
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(C.array == correct);
}

/// Create mask from the most significant bit of each 8-bit element in `a`.
int _mm_movemask_epi8 (__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pmovmskb128(cast(byte16)a);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pmovmskb128(cast(byte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions there rely on intrinsics that LLVM cannot find, and that took a long
        // time to diagnose. So there might be something a bit faster, but this one is reasonable and branchless.
        byte8 mask_shift;
        mask_shift.ptr[0] = 7;
        mask_shift.ptr[1] = 6;
        mask_shift.ptr[2] = 5;
        mask_shift.ptr[3] = 4;
        mask_shift.ptr[4] = 3;
        mask_shift.ptr[5] = 2;
        mask_shift.ptr[6] = 1;
        mask_shift.ptr[7] = 0;
        byte8 mask_and = byte8(-128);
        byte8 lo = vget_low_u8(cast(byte16)a);
        byte8 hi = vget_high_u8(cast(byte16)a);
        lo = vand_u8(lo, mask_and);
        lo = vshr_u8(lo, mask_shift);
        hi = vand_u8(hi, mask_and);
        hi = vshr_u8(hi, mask_shift);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
    }
    else
    {
        byte16 ai = cast(byte16)a;
        int r = 0;
        foreach(bit; 0..16)
        {
            if (ai.array[bit] < 0) r += (1 << bit);
        }
        return r;
    }
}
unittest
{
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
}

/// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit)
/// floating-point element in `v`.
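/// Example (illustrative sketch): bit 0 of the result reflects the low lane, bit 1 the high
/// lane, each set when that lane's sign bit is set.
/// ---
/// __m128d x = _mm_setr_pd(-1.0, 2.0);
/// int m = _mm_movemask_pd(x); // m == 1: only the low lane is negative
/// ---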
2386 int _mm_movemask_pd(__m128d v) pure @safe 2387 { 2388 static if (GDC_with_SSE2) 2389 { 2390 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2391 /// packed double-precision (64-bit) floating-point element in `v`. 2392 return __builtin_ia32_movmskpd(v); 2393 } 2394 else static if (LDC_with_SSE2) 2395 { 2396 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2397 /// packed double-precision (64-bit) floating-point element in `v`. 2398 return __builtin_ia32_movmskpd(v); 2399 } 2400 else 2401 { 2402 long2 lv = cast(long2)v; 2403 int r = 0; 2404 if (lv.array[0] < 0) r += 1; 2405 if (lv.array[1] < 0) r += 2; 2406 return r; 2407 } 2408 } 2409 unittest 2410 { 2411 __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0); 2412 assert(_mm_movemask_pd(A) == 2); 2413 } 2414 2415 /// Copy the lower 64-bit integer in `v`. 2416 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe 2417 { 2418 long2 lv = cast(long2)v; 2419 return long1(lv.array[0]); 2420 } 2421 unittest 2422 { 2423 __m128i A = _mm_set_epi64x(-1, -2); 2424 __m64 R = _mm_movepi64_pi64(A); 2425 assert(R.array[0] == -2); 2426 } 2427 2428 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element. 2429 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted 2430 { 2431 long2 r; 2432 r.ptr[0] = a.array[0]; 2433 r.ptr[1] = 0; 2434 return cast(__m128i)r; 2435 } 2436 2437 // Note: generates pmuludq in LDC with -O1 2438 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted 2439 { 2440 __m128i zero = _mm_setzero_si128(); 2441 2442 static if (__VERSION__ >= 2088) 2443 { 2444 // Need LLVM9 to avoid this shufflevector 2445 long2 la, lb; 2446 la.ptr[0] = cast(uint)a.array[0]; 2447 la.ptr[1] = cast(uint)a.array[2]; 2448 lb.ptr[0] = cast(uint)b.array[0]; 2449 lb.ptr[1] = cast(uint)b.array[2]; 2450 } 2451 else 2452 { 2453 long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero); 2454 long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero); 2455 } 2456 2457 version(DigitalMars) 2458 { 2459 // DMD has no long2 mul 2460 // long2 mul not supported before LDC 1.5 2461 la.ptr[0] *= lb.array[0]; 2462 la.ptr[1] *= lb.array[1]; 2463 return cast(__m128i)(la); 2464 } 2465 else 2466 { 2467 static if (__VERSION__ >= 2076) 2468 { 2469 return cast(__m128i)(la * lb); 2470 } 2471 else 2472 { 2473 // long2 mul not supported before LDC 1.5 2474 la.ptr[0] *= lb.array[0]; 2475 la.ptr[1] *= lb.array[1]; 2476 return cast(__m128i)(la); 2477 } 2478 } 2479 } 2480 unittest 2481 { 2482 __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff); 2483 __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff); 2484 __m128i C = _mm_mul_epu32(A, B); 2485 long2 LC = cast(long2)C; 2486 assert(LC.array[0] == 18446744065119617025uL); 2487 assert(LC.array[1] == 12723420444339690338uL); 2488 } 2489 2490 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 2491 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe 2492 { 2493 return a * b; 2494 } 2495 unittest 2496 { 2497 __m128d a = [-2.0, 1.5]; 2498 a = _mm_mul_pd(a, a); 2499 assert(a.array == [4.0, 2.25]); 2500 } 2501 2502 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 2503 /// element of result, and copy the upper element from `a` to the upper element of result. 
2504 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted 2505 { 2506 version(DigitalMars) 2507 { 2508 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 2509 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 2510 asm pure nothrow @nogc @trusted { nop;} 2511 a.array[0] = a.array[0] * b.array[0]; 2512 return a; 2513 } 2514 else static if (GDC_with_SSE2) 2515 { 2516 return __builtin_ia32_mulsd(a, b); 2517 } 2518 else 2519 { 2520 a.ptr[0] *= b.array[0]; 2521 return a; 2522 } 2523 } 2524 unittest 2525 { 2526 __m128d a = [-2.0, 1.5]; 2527 a = _mm_mul_sd(a, a); 2528 assert(a.array == [4.0, 1.5]); 2529 } 2530 2531 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 2532 /// and get an unsigned 64-bit result. 2533 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe 2534 { 2535 return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b))); 2536 } 2537 unittest 2538 { 2539 __m64 A = _mm_set_pi32(42, 0xDEADBEEF); 2540 __m64 B = _mm_set_pi32(42, 0xCAFEBABE); 2541 __m64 C = _mm_mul_su32(A, B); 2542 assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL); 2543 } 2544 2545 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2546 /// high 16 bits of the intermediate integers. 2547 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted 2548 { 2549 static if (GDC_with_SSE2) 2550 { 2551 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2552 } 2553 else static if (LDC_with_SSE2) 2554 { 2555 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2556 } 2557 else 2558 { 2559 // PERF ARM? 2560 short8 sa = cast(short8)a; 2561 short8 sb = cast(short8)b; 2562 short8 r = void; 2563 r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16; 2564 r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16; 2565 r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16; 2566 r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16; 2567 r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16; 2568 r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16; 2569 r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16; 2570 r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16; 2571 return cast(__m128i)r; 2572 } 2573 } 2574 unittest 2575 { 2576 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2577 __m128i B = _mm_set1_epi16(16384); 2578 short8 R = cast(short8)_mm_mulhi_epi16(A, B); 2579 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; 2580 assert(R.array == correct); 2581 } 2582 2583 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2584 /// high 16 bits of the intermediate integers. 2585 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted 2586 { 2587 static if (GDC_with_SSE2) 2588 { 2589 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2590 } 2591 else static if (LDC_with_SSE2) 2592 { 2593 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2594 } 2595 else 2596 { 2597 // PERF ARM?? 
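        // Scalar fallback: per lane, zero-extend both operands, form the 32-bit product and
        // keep its upper 16 bits, which is what PMULHUW returns.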
2598 short8 sa = cast(short8)a; 2599 short8 sb = cast(short8)b; 2600 short8 r = void; 2601 r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 ); 2602 r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 ); 2603 r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 ); 2604 r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 ); 2605 r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 ); 2606 r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 ); 2607 r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 ); 2608 r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 ); 2609 return cast(__m128i)r; 2610 } 2611 } 2612 unittest 2613 { 2614 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2615 __m128i B = _mm_set1_epi16(16384); 2616 short8 R = cast(short8)_mm_mulhi_epu16(A, B); 2617 short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; 2618 assert(R.array == correct); 2619 } 2620 2621 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 2622 /// bits of the intermediate integers. 2623 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe 2624 { 2625 return cast(__m128i)(cast(short8)a * cast(short8)b); 2626 } 2627 unittest 2628 { 2629 __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7); 2630 __m128i B = _mm_set1_epi16(16384); 2631 short8 R = cast(short8)_mm_mullo_epi16(A, B); 2632 short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384]; 2633 assert(R.array == correct); 2634 } 2635 2636 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 2637 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe 2638 { 2639 return cast(__m128d)( cast(__m128i)a | cast(__m128i)b ); 2640 } 2641 2642 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`. 2643 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe 2644 { 2645 return a | b; 2646 } 2647 2648 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation. 
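/// Example (illustrative sketch): narrowing two `int4` vectors into one `short8`, clamping
/// out-of-range values to the -32768..32767 range.
/// ---
/// __m128i v = _mm_setr_epi32(70000, -70000, 5, -5);
/// short8 packed = cast(short8) _mm_packs_epi32(v, v); // [32767, -32768, 5, -5, 32767, -32768, 5, -5]
/// ---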
2649 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted 2650 { 2651 static if (GDC_with_SSE2) 2652 { 2653 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2654 } 2655 else static if (LDC_with_SSE2) 2656 { 2657 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2658 } 2659 else static if (LDC_with_ARM64) 2660 { 2661 short4 ra = vqmovn_s32(cast(int4)a); 2662 short4 rb = vqmovn_s32(cast(int4)b); 2663 return cast(__m128i)vcombine_s16(ra, rb); 2664 } 2665 else 2666 { 2667 // PERF: catastrophic on ARM 2668 short8 r; 2669 r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]); 2670 r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]); 2671 r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]); 2672 r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]); 2673 r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]); 2674 r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]); 2675 r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]); 2676 r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]); 2677 return cast(__m128i)r; 2678 } 2679 } 2680 unittest 2681 { 2682 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 2683 short8 R = cast(short8) _mm_packs_epi32(A, A); 2684 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; 2685 assert(R.array == correct); 2686 } 2687 2688 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation. 2689 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted 2690 { 2691 static if (GDC_with_SSE2) 2692 { 2693 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2694 } 2695 else static if (LDC_with_SSE2) 2696 { 2697 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2698 } 2699 else static if (LDC_with_ARM64) 2700 { 2701 // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02 2702 byte8 ra = vqmovn_s16(cast(short8)a); 2703 byte8 rb = vqmovn_s16(cast(short8)b); 2704 return cast(__m128i)vcombine_s8(ra, rb); 2705 } 2706 else 2707 { 2708 // PERF: ARM32 is missing 2709 byte16 r; 2710 short8 sa = cast(short8)a; 2711 short8 sb = cast(short8)b; 2712 foreach(i; 0..8) 2713 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]); 2714 foreach(i; 0..8) 2715 r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]); 2716 return cast(__m128i)r; 2717 } 2718 } 2719 unittest 2720 { 2721 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); 2722 byte16 R = cast(byte16) _mm_packs_epi16(A, A); 2723 byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0, 2724 127, -128, 127, 0, 127, -128, 127, 0]; 2725 assert(R.array == correct); 2726 } 2727 2728 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation. 
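/// Example (illustrative sketch): a common use is clamping 16-bit intermediate pixel values
/// back into the 0..255 range.
/// ---
/// __m128i pixels = _mm_setr_epi16(-3, 0, 128, 300, 255, 256, 1, 2);
/// byte16 u8 = cast(byte16) _mm_packus_epi16(pixels, pixels);
/// // first 8 bytes, read as unsigned: 0, 0, 128, 255, 255, 255, 1, 2
/// ---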
2729 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2730 { 2731 static if (GDC_with_SSE2) 2732 { 2733 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2734 } 2735 else static if (LDC_with_SSE2) 2736 { 2737 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2738 } 2739 else static if (LDC_with_ARM64) 2740 { 2741 // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02 2742 byte8 ra = vqmovun_s16(cast(short8)a); 2743 byte8 rb = vqmovun_s16(cast(short8)b); 2744 return cast(__m128i)vcombine_s8(ra, rb); 2745 } 2746 else 2747 { 2748 short8 sa = cast(short8)a; 2749 short8 sb = cast(short8)b; 2750 ubyte[16] result = void; 2751 for (int i = 0; i < 8; ++i) 2752 { 2753 short s = sa[i]; 2754 if (s < 0) s = 0; 2755 if (s > 255) s = 255; 2756 result[i] = cast(ubyte)s; 2757 2758 s = sb[i]; 2759 if (s < 0) s = 0; 2760 if (s > 255) s = 255; 2761 result[i+8] = cast(ubyte)s; 2762 } 2763 return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr); 2764 } 2765 } 2766 unittest 2767 { 2768 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 2769 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 2770 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 2771 0, 255, 0, 255, 255, 2, 1, 0]; 2772 foreach(i; 0..16) 2773 assert(AA.array[i] == cast(byte)(correctResult[i])); 2774 } 2775 2776 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 2777 /// and power consumption of spin-wait loops. 2778 void _mm_pause() @trusted 2779 { 2780 version(GNU) 2781 { 2782 static if (GDC_with_SSE2) 2783 { 2784 __builtin_ia32_pause(); 2785 } 2786 else version(X86) 2787 { 2788 asm pure nothrow @nogc @trusted 2789 { 2790 "pause;\n" : : : ; 2791 } 2792 } 2793 else 2794 static assert(false); 2795 } 2796 else static if (LDC_with_SSE2) 2797 { 2798 __builtin_ia32_pause(); 2799 } 2800 else static if (DMD_with_asm) 2801 { 2802 asm nothrow @nogc pure @safe 2803 { 2804 rep; nop; // F3 90 = pause 2805 } 2806 } 2807 else version (LDC) 2808 { 2809 // PERF: Do nothing currently , could be the "yield" intruction on ARM. 2810 } 2811 else 2812 static assert(false); 2813 } 2814 unittest 2815 { 2816 _mm_pause(); 2817 } 2818 2819 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 2820 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 2821 /// low 16 bits of 64-bit elements in result. 2822 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted 2823 { 2824 static if (GDC_with_SSE2) 2825 { 2826 return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b); 2827 } 2828 else static if (LDC_with_SSE2) 2829 { 2830 return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b); 2831 } 2832 else 2833 { 2834 // PERF: ARM?? 
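        // Scalar fallback: take |a[i] - b[i]| over the 16 unsigned bytes, then sum the first
        // eight differences into the low 64-bit lane and the last eight into the high 64-bit
        // lane, mirroring PSADBW.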
2835 byte16 ab = cast(byte16)a; 2836 byte16 bb = cast(byte16)b; 2837 ubyte[16] t; 2838 foreach(i; 0..16) 2839 { 2840 int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]); 2841 if (diff < 0) diff = -diff; 2842 t[i] = cast(ubyte)(diff); 2843 } 2844 int4 r = _mm_setzero_si128(); 2845 r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 2846 r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; 2847 return r; 2848 } 2849 } 2850 unittest 2851 { 2852 __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 2853 __m128i B = _mm_set1_epi8(1); 2854 __m128i R = _mm_sad_epu8(A, B); 2855 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 2856 0, 2857 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 2858 0]; 2859 assert(R.array == correct); 2860 } 2861 2862 /// Set packed 16-bit integers with the supplied values. 2863 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 2864 { 2865 short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7]; 2866 return cast(__m128i) loadUnaligned!(short8)(result.ptr); 2867 } 2868 unittest 2869 { 2870 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 2871 short8 B = cast(short8) A; 2872 foreach(i; 0..8) 2873 assert(B.array[i] == i); 2874 } 2875 2876 /// Set packed 32-bit integers with the supplied values. 2877 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 2878 { 2879 int[4] result = [e0, e1, e2, e3]; 2880 return loadUnaligned!(int4)(result.ptr); 2881 } 2882 unittest 2883 { 2884 __m128i A = _mm_set_epi32(3, 2, 1, 0); 2885 foreach(i; 0..4) 2886 assert(A.array[i] == i); 2887 } 2888 2889 /// Set packed 64-bit integers with the supplied values. 2890 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted 2891 { 2892 long[2] result = [e0.array[0], e1.array[0]]; 2893 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 2894 } 2895 unittest 2896 { 2897 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); 2898 long2 B = cast(long2) A; 2899 assert(B.array[0] == 5678); 2900 assert(B.array[1] == 1234); 2901 } 2902 2903 /// Set packed 64-bit integers with the supplied values. 2904 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted 2905 { 2906 long[2] result = [e0, e1]; 2907 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 2908 } 2909 unittest 2910 { 2911 __m128i A = _mm_set_epi64x(1234, 5678); 2912 long2 B = cast(long2) A; 2913 assert(B.array[0] == 5678); 2914 assert(B.array[1] == 1234); 2915 } 2916 2917 /// Set packed 8-bit integers with the supplied values. 2918 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12, 2919 byte e11, byte e10, byte e9, byte e8, 2920 byte e7, byte e6, byte e5, byte e4, 2921 byte e3, byte e2, byte e1, byte e0) pure @trusted 2922 { 2923 byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7, 2924 e8, e9, e10, e11, e12, e13, e14, e15]; 2925 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 2926 } 2927 2928 /// Set packed double-precision (64-bit) floating-point elements with the supplied values. 2929 __m128d _mm_set_pd (double e1, double e0) pure @trusted 2930 { 2931 double[2] result = [e0, e1]; 2932 return loadUnaligned!(double2)(result.ptr); 2933 } 2934 unittest 2935 { 2936 __m128d A = _mm_set_pd(61.0, 55.0); 2937 double[2] correct = [55.0, 61.0]; 2938 assert(A.array == correct); 2939 } 2940 2941 /// Broadcast double-precision (64-bit) floating-point value `a` to all element. 
2942 __m128d _mm_set_pd1 (double a) pure @trusted 2943 { 2944 double[2] result = [a, a]; 2945 return loadUnaligned!(double2)(result.ptr); 2946 } 2947 unittest 2948 { 2949 __m128d A = _mm_set_pd1(61.0); 2950 double[2] correct = [61.0, 61.0]; 2951 assert(A.array == correct); 2952 } 2953 2954 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 2955 /// and zero the upper element. 2956 __m128d _mm_set_sd (double a) pure @trusted 2957 { 2958 double[2] result = [a, 0]; 2959 return loadUnaligned!(double2)(result.ptr); 2960 } 2961 2962 /// Broadcast 16-bit integer a to all elements of dst. 2963 __m128i _mm_set1_epi16 (short a) pure @trusted 2964 { 2965 version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 2966 { 2967 short8 v = a; 2968 return cast(__m128i) v; 2969 } 2970 else 2971 return cast(__m128i)(short8(a)); 2972 } 2973 unittest 2974 { 2975 short8 a = cast(short8) _mm_set1_epi16(31); 2976 for (int i = 0; i < 8; ++i) 2977 assert(a.array[i] == 31); 2978 } 2979 2980 /// Broadcast 32-bit integer `a` to all elements. 2981 __m128i _mm_set1_epi32 (int a) pure @trusted 2982 { 2983 return cast(__m128i)(int4(a)); 2984 } 2985 unittest 2986 { 2987 __m128 a = _mm_set1_ps(-1.0f); 2988 __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff); 2989 assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]); 2990 } 2991 2992 /// Broadcast 64-bit integer `a` to all elements. 2993 __m128i _mm_set1_epi64 (__m64 a) pure @safe 2994 { 2995 return _mm_set_epi64(a, a); 2996 } 2997 unittest 2998 { 2999 long b = 0x1DEADCAFE; 3000 __m64 a; 3001 a.ptr[0] = b; 3002 long2 c = cast(long2) _mm_set1_epi64(a); 3003 assert(c.array[0] == b); 3004 assert(c.array[1] == b); 3005 } 3006 3007 /// Broadcast 64-bit integer `a` to all elements 3008 __m128i _mm_set1_epi64x (long a) pure @trusted 3009 { 3010 long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3011 return cast(__m128i)(b); 3012 } 3013 unittest 3014 { 3015 long b = 0x1DEADCAFE; 3016 long2 c = cast(long2) _mm_set1_epi64x(b); 3017 for (int i = 0; i < 2; ++i) 3018 assert(c.array[i] == b); 3019 } 3020 3021 /// Broadcast 8-bit integer `a` to all elements. 3022 __m128i _mm_set1_epi8 (byte a) pure @trusted 3023 { 3024 byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3025 return cast(__m128i)(b); 3026 } 3027 unittest 3028 { 3029 byte16 b = cast(byte16) _mm_set1_epi8(31); 3030 for (int i = 0; i < 16; ++i) 3031 assert(b.array[i] == 31); 3032 } 3033 3034 alias _mm_set1_pd = _mm_set_pd1; 3035 3036 /// Set packed 16-bit integers with the supplied values in reverse order. 3037 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 3038 short e3, short e2, short e1, short e0) pure @trusted 3039 { 3040 short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0]; 3041 return cast(__m128i)( loadUnaligned!(short8)(result.ptr) ); 3042 } 3043 unittest 3044 { 3045 short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0); 3046 short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0]; 3047 assert(A.array == correct); 3048 } 3049 3050 /// Set packed 32-bit integers with the supplied values in reverse order. 
3051 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3052 { 3053 int[4] result = [e3, e2, e1, e0]; 3054 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 3055 } 3056 unittest 3057 { 3058 int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647); 3059 int[4] correct = [-1, 0, -2147483648, 2147483647]; 3060 assert(A.array == correct); 3061 } 3062 3063 /// Set packed 64-bit integers with the supplied values in reverse order. 3064 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted 3065 { 3066 long[2] result = [e1, e0]; 3067 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 3068 } 3069 unittest 3070 { 3071 long2 A = cast(long2) _mm_setr_epi64(-1, 0); 3072 long[2] correct = [-1, 0]; 3073 assert(A.array == correct); 3074 } 3075 3076 /// Set packed 8-bit integers with the supplied values in reverse order. 3077 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12, 3078 byte e11, byte e10, byte e9, byte e8, 3079 byte e7, byte e6, byte e5, byte e4, 3080 byte e3, byte e2, byte e1, byte e0) pure @trusted 3081 { 3082 byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, 3083 e7, e6, e5, e4, e3, e2, e1, e0]; 3084 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 3085 } 3086 3087 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order. 3088 __m128d _mm_setr_pd (double e1, double e0) pure @trusted 3089 { 3090 double[2] result = [e1, e0]; 3091 return loadUnaligned!(double2)(result.ptr); 3092 } 3093 unittest 3094 { 3095 __m128d A = _mm_setr_pd(61.0, 55.0); 3096 double[2] correct = [61.0, 55.0]; 3097 assert(A.array == correct); 3098 } 3099 3100 /// Return vector of type `__m128d` with all elements set to zero. 3101 __m128d _mm_setzero_pd () pure @trusted 3102 { 3103 // Note: using loadUnaligned has better -O0 codegen compared to .ptr 3104 double[2] result = [0.0, 0.0]; 3105 return loadUnaligned!(double2)(result.ptr); 3106 } 3107 3108 /// Return vector of type `__m128i` with all elements set to zero. 3109 __m128i _mm_setzero_si128() pure @trusted 3110 { 3111 // Note: using loadUnaligned has better -O0 codegen compared to .ptr 3112 int[4] result = [0, 0, 0, 0]; 3113 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 3114 } 3115 3116 /// Shuffle 32-bit integers in a using the control in `imm8`. 3117 /// See_also: `_MM_SHUFFLE`. 3118 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe 3119 { 3120 static if (GDC_with_SSE2) 3121 { 3122 return __builtin_ia32_pshufd(a, imm8); 3123 } 3124 else 3125 { 3126 return shufflevector!(int4, (imm8 >> 0) & 3, 3127 (imm8 >> 2) & 3, 3128 (imm8 >> 4) & 3, 3129 (imm8 >> 6) & 3)(a, a); 3130 } 3131 } 3132 unittest 3133 { 3134 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 3135 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3136 int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A); 3137 int[4] expectedB = [ 3, 2, 1, 0 ]; 3138 assert(B.array == expectedB); 3139 } 3140 3141 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`. 3142 /// See_also: `_MM_SHUFFLE2`. 
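/// Example (illustrative sketch): `imm8` is a template argument, typically built with
/// `_MM_SHUFFLE2`.
/// ---
/// __m128d a = _mm_setr_pd(1.0, 2.0);
/// __m128d b = _mm_setr_pd(3.0, 4.0);
/// __m128d r = _mm_shuffle_pd!(_MM_SHUFFLE2(0, 1))(a, b); // r = [2.0, 3.0]
/// ---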
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_shufpd(a, b, imm8);
    }
    else
    {
        return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                       2 + ( (imm8 >> 1) & 1 ))(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}

/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufhw(a, imm8);
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                            4 + ( (imm8 >> 0) & 3 ),
                                            4 + ( (imm8 >> 2) & 3 ),
                                            4 + ( (imm8 >> 4) & 3 ),
                                            4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}

/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64
/// bits of result, with the high 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshuflw(a, imm8);
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                    ( (imm8 >> 2) & 3 ),
                                                    ( (imm8 >> 4) & 3 ),
                                                    ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}

/// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            pslld XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << bits;
        return r;
    }
}

/// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
3252 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted 3253 { 3254 static if (LDC_with_SSE2) 3255 { 3256 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); 3257 } 3258 else static if (GDC_with_SSE2) 3259 { 3260 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); 3261 } 3262 else static if (DMD_with_32bit_asm) 3263 { 3264 asm pure nothrow @nogc @trusted 3265 { 3266 movdqu XMM0, a; 3267 movdqu XMM1, count; 3268 psllq XMM0, XMM1; 3269 movdqu a, XMM0; 3270 } 3271 return a; 3272 } 3273 else 3274 { 3275 // ARM: good since LDC 1.12 -O2 3276 // ~but -O0 version is catastrophic 3277 long2 r = void; 3278 long2 sa = cast(long2)a; 3279 long2 lc = cast(long2)count; 3280 int bits = cast(int)(lc.array[0]); 3281 foreach(i; 0..2) 3282 r.array[i] = cast(ulong)(sa.array[i]) << bits; 3283 return cast(__m128i)r; 3284 } 3285 } 3286 3287 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros. 3288 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted 3289 { 3290 static if (LDC_with_SSE2) 3291 { 3292 return cast(__m128i) _mm_sll_epi16(cast(short8)a, count); 3293 } 3294 else static if (GDC_with_SSE2) 3295 { 3296 return cast(__m128i) _mm_sll_epi16(cast(short8)a, count); 3297 } 3298 else static if (DMD_with_32bit_asm) 3299 { 3300 asm pure nothrow @nogc 3301 { 3302 movdqu XMM0, a; 3303 movdqu XMM1, count; 3304 psllw XMM0, XMM1; 3305 movdqu a, XMM0; 3306 } 3307 return a; 3308 } 3309 else 3310 { 3311 short8 sa = cast(short8)a; 3312 long2 lc = cast(long2)count; 3313 int bits = cast(int)(lc.array[0]); 3314 short8 r = void; 3315 foreach(i; 0..8) 3316 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits); 3317 return cast(int4)r; 3318 } 3319 } 3320 3321 3322 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. 3323 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted 3324 { 3325 static if (GDC_with_SSE2) 3326 { 3327 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); 3328 } 3329 else static if (LDC_with_SSE2) 3330 { 3331 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); 3332 } 3333 else 3334 { 3335 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3336 // D says "It's illegal to shift by the same or more bits 3337 // than the size of the quantity being shifted" 3338 // and it's UB instead. 3339 int4 r = _mm_setzero_si128(); 3340 3341 ubyte count = cast(ubyte) imm8; 3342 if (count > 31) 3343 return r; 3344 3345 foreach(i; 0..4) 3346 r.array[i] = cast(uint)(a.array[i]) << count; 3347 return r; 3348 } 3349 } 3350 unittest 3351 { 3352 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3353 __m128i B = _mm_slli_epi32(A, 1); 3354 __m128i B2 = _mm_slli_epi32(A, 1 + 256); 3355 int[4] expectedB = [ 0, 4, 6, -8]; 3356 assert(B.array == expectedB); 3357 assert(B2.array == expectedB); 3358 3359 __m128i C = _mm_slli_epi32(A, 0); 3360 int[4] expectedC = [ 0, 2, 3, -4]; 3361 assert(C.array == expectedC); 3362 3363 __m128i D = _mm_slli_epi32(A, 65); 3364 int[4] expectedD = [ 0, 0, 0, 0]; 3365 assert(D.array == expectedD); 3366 } 3367 3368 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 
3369 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted 3370 { 3371 static if (GDC_with_SSE2) 3372 { 3373 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3374 } 3375 else static if (LDC_with_SSE2) 3376 { 3377 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3378 } 3379 else 3380 { 3381 long2 sa = cast(long2)a; 3382 3383 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3384 // D says "It's illegal to shift by the same or more bits 3385 // than the size of the quantity being shifted" 3386 // and it's UB instead. 3387 long2 r = cast(long2) _mm_setzero_si128(); 3388 ubyte count = cast(ubyte) imm8; 3389 if (count > 63) 3390 return cast(__m128i)r; 3391 3392 r.ptr[0] = cast(ulong)(sa.array[0]) << count; 3393 r.ptr[1] = cast(ulong)(sa.array[1]) << count; 3394 return cast(__m128i)r; 3395 } 3396 } 3397 unittest 3398 { 3399 __m128i A = _mm_setr_epi64(8, -4); 3400 long2 B = cast(long2) _mm_slli_epi64(A, 1); 3401 long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024); 3402 long[2] expectedB = [ 16, -8]; 3403 assert(B.array == expectedB); 3404 assert(B2.array == expectedB); 3405 3406 long2 C = cast(long2) _mm_slli_epi64(A, 0); 3407 long[2] expectedC = [ 8, -4]; 3408 assert(C.array == expectedC); 3409 3410 long2 D = cast(long2) _mm_slli_epi64(A, 64); 3411 long[2] expectedD = [ 0, -0]; 3412 assert(D.array == expectedD); 3413 } 3414 3415 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 3416 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted 3417 { 3418 static if (GDC_with_SSE2) 3419 { 3420 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3421 } 3422 else static if (LDC_with_SSE2) 3423 { 3424 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3425 } 3426 else static if (LDC_with_ARM64) 3427 { 3428 short8 sa = cast(short8)a; 3429 short8 r = cast(short8)_mm_setzero_si128(); 3430 ubyte count = cast(ubyte) imm8; 3431 if (count > 15) 3432 return cast(__m128i)r; 3433 r = sa << short8(count); 3434 return cast(__m128i)r; 3435 } 3436 else 3437 { 3438 short8 sa = cast(short8)a; 3439 short8 r = cast(short8)_mm_setzero_si128(); 3440 ubyte count = cast(ubyte) imm8; 3441 if (count > 15) 3442 return cast(__m128i)r; 3443 foreach(i; 0..8) 3444 r.ptr[i] = cast(short)(sa.array[i] << count); 3445 return cast(__m128i)r; 3446 } 3447 } 3448 unittest 3449 { 3450 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3451 short8 B = cast(short8)( _mm_slli_epi16(A, 1) ); 3452 short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) ); 3453 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 3454 assert(B.array == expectedB); 3455 assert(B2.array == expectedB); 3456 3457 short8 C = cast(short8)( _mm_slli_epi16(A, 16) ); 3458 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 3459 assert(C.array == expectedC); 3460 } 3461 3462 3463 /// Shift `a` left by `bytes` bytes while shifting in zeros. 
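/// Example (illustrative sketch): `bytes` is a compile-time template argument, and the whole
/// 128-bit value moves towards its most significant bytes.
/// ---
/// __m128i v = _mm_setr_epi32(1, 2, 3, 4);
/// __m128i r = _mm_slli_si128!4(v); // r = [0, 1, 2, 3] as 32-bit lanes
/// ---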
3464 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted 3465 { 3466 static if (bytes & 0xF0) 3467 { 3468 return _mm_setzero_si128(); 3469 } 3470 else 3471 { 3472 static if (GDC_with_SSE2) 3473 { 3474 return __builtin_ia32_pslldqi128(op, cast(ubyte)(bytes * 8)); 3475 } 3476 else version(DigitalMars) 3477 { 3478 version(D_InlineAsm_X86) 3479 { 3480 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64 3481 { 3482 movdqu XMM0, op; 3483 pslldq XMM0, bytes; 3484 movdqu op, XMM0; 3485 } 3486 return op; 3487 } 3488 else 3489 { 3490 byte16 A = cast(byte16)op; 3491 byte16 R; 3492 for (int n = 15; n >= bytes; --n) 3493 R.ptr[n] = A.array[n-bytes]; 3494 for (int n = bytes-1; n >= 0; --n) 3495 R.ptr[n] = 0; 3496 return cast(__m128i)R; 3497 } 3498 } 3499 else 3500 { 3501 return cast(__m128i) shufflevector!(byte16, 3502 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes, 3503 22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes, 3504 28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes) 3505 (cast(byte16)_mm_setzero_si128(), cast(byte16)op); 3506 } 3507 } 3508 } 3509 unittest 3510 { 3511 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3512 short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left 3513 short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ]; 3514 assert(R.array == correct); 3515 3516 __m128i B = _mm_srli_si128!16(_mm_set1_epi32(-1)); 3517 int[4] expectedB = [0, 0, 0, 0]; 3518 assert(B.array == expectedB); 3519 } 3520 3521 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`. 3522 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted 3523 { 3524 version(LDC) 3525 { 3526 // Disappeared with LDC 1.11 3527 static if (__VERSION__ < 2081) 3528 return __builtin_ia32_sqrtpd(vec); 3529 else 3530 { 3531 vec.array[0] = llvm_sqrt(vec.array[0]); 3532 vec.array[1] = llvm_sqrt(vec.array[1]); 3533 return vec; 3534 } 3535 } 3536 else static if (GDC_with_SSE2) 3537 { 3538 return __builtin_ia32_sqrtpd(vec); 3539 } 3540 else 3541 { 3542 vec.ptr[0] = sqrt(vec.array[0]); 3543 vec.ptr[1] = sqrt(vec.array[1]); 3544 return vec; 3545 } 3546 } 3547 3548 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 3549 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 3550 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted 3551 { 3552 // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only. 3553 // "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 3554 // The quadword at bits 127:64 of the destination operand remains unchanged." 
3555 version(LDC) 3556 { 3557 // Disappeared with LDC 1.11 3558 static if (__VERSION__ < 2081) 3559 { 3560 __m128d c = __builtin_ia32_sqrtsd(b); 3561 a[0] = c[0]; 3562 return a; 3563 } 3564 else 3565 { 3566 a.array[0] = llvm_sqrt(b.array[0]); 3567 return a; 3568 } 3569 } 3570 else static if (GDC_with_SSE2) 3571 { 3572 __m128d c = __builtin_ia32_sqrtsd(b); 3573 a.ptr[0] = c.array[0]; 3574 return a; 3575 } 3576 else 3577 { 3578 a.ptr[0] = sqrt(b.array[0]); 3579 return a; 3580 } 3581 } 3582 unittest 3583 { 3584 __m128d A = _mm_setr_pd(1.0, 3.0); 3585 __m128d B = _mm_setr_pd(4.0, 5.0); 3586 __m128d R = _mm_sqrt_sd(A, B); 3587 double[2] correct = [2.0, 3.0 ]; 3588 assert(R.array == correct); 3589 } 3590 3591 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits. 3592 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted 3593 { 3594 static if (GDC_with_SSE2) 3595 { 3596 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3597 } 3598 else static if (LDC_with_SSE2) 3599 { 3600 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3601 } 3602 else 3603 { 3604 short8 sa = cast(short8)a; 3605 long2 lc = cast(long2)count; 3606 int bits = cast(int)(lc.array[0]); 3607 short8 r = void; 3608 foreach(i; 0..8) 3609 r.ptr[i] = cast(short)(sa.array[i] >> bits); 3610 return cast(int4)r; 3611 } 3612 } 3613 3614 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits. 3615 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted 3616 { 3617 static if (LDC_with_SSE2) 3618 { 3619 return __builtin_ia32_psrad128(a, count); 3620 } 3621 else static if (GDC_with_SSE2) 3622 { 3623 return __builtin_ia32_psrad128(a, count); 3624 } 3625 else 3626 { 3627 int4 r = void; 3628 long2 lc = cast(long2)count; 3629 int bits = cast(int)(lc.array[0]); 3630 r.ptr[0] = (a.array[0] >> bits); 3631 r.ptr[1] = (a.array[1] >> bits); 3632 r.ptr[2] = (a.array[2] >> bits); 3633 r.ptr[3] = (a.array[3] >> bits); 3634 return r; 3635 } 3636 } 3637 3638 3639 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 3640 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 3641 { 3642 static if (GDC_with_SSE2) 3643 { 3644 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3645 } 3646 else static if (LDC_with_SSE2) 3647 { 3648 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3649 } 3650 else static if (LDC_with_ARM64) 3651 { 3652 short8 sa = cast(short8)a; 3653 ubyte count = cast(ubyte)imm8; 3654 if (count > 15) 3655 count = 15; 3656 short8 r = sa >> short8(count); 3657 return cast(__m128i)r; 3658 } 3659 else 3660 { 3661 short8 sa = cast(short8)a; 3662 short8 r = void; 3663 3664 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3665 // D says "It's illegal to shift by the same or more bits 3666 // than the size of the quantity being shifted" 3667 // and it's UB instead. 
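        // For arithmetic right shifts the count saturates: anything above 15 behaves like 15,
        // leaving each lane as 0 or -1 depending on its sign.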
3668 ubyte count = cast(ubyte)imm8; 3669 if (count > 15) 3670 count = 15; 3671 foreach(i; 0..8) 3672 r.ptr[i] = cast(short)(sa.array[i] >> count); 3673 return cast(int4)r; 3674 } 3675 } 3676 unittest 3677 { 3678 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3679 short8 B = cast(short8)( _mm_srai_epi16(A, 1) ); 3680 short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) ); 3681 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 3682 assert(B.array == expectedB); 3683 assert(B2.array == expectedB); 3684 3685 short8 C = cast(short8)( _mm_srai_epi16(A, 18) ); 3686 short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ]; 3687 assert(C.array == expectedC); 3688 } 3689 3690 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 3691 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted 3692 { 3693 static if (LDC_with_SSE2) 3694 { 3695 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 3696 } 3697 else static if (GDC_with_SSE2) 3698 { 3699 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 3700 } 3701 else 3702 { 3703 int4 r = void; 3704 3705 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3706 // D says "It's illegal to shift by the same or more bits 3707 // than the size of the quantity being shifted" 3708 // and it's UB instead. 3709 ubyte count = cast(ubyte) imm8; 3710 if (count > 31) 3711 count = 31; 3712 3713 r.ptr[0] = (a.array[0] >> count); 3714 r.ptr[1] = (a.array[1] >> count); 3715 r.ptr[2] = (a.array[2] >> count); 3716 r.ptr[3] = (a.array[3] >> count); 3717 return r; 3718 } 3719 } 3720 unittest 3721 { 3722 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3723 __m128i B = _mm_srai_epi32(A, 1); 3724 __m128i B2 = _mm_srai_epi32(A, 1 + 256); 3725 int[4] expectedB = [ 0, 1, 1, -2]; 3726 assert(B.array == expectedB); 3727 assert(B2.array == expectedB); 3728 3729 __m128i C = _mm_srai_epi32(A, 32); 3730 int[4] expectedC = [ 0, 0, 0, -1]; 3731 assert(C.array == expectedC); 3732 3733 __m128i D = _mm_srai_epi32(A, 0); 3734 int[4] expectedD = [ 0, 2, 3, -4]; 3735 assert(D.array == expectedD); 3736 } 3737 3738 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted 3739 { 3740 static if (LDC_with_SSE2) 3741 { 3742 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 3743 } 3744 else static if (GDC_with_SSE2) 3745 { 3746 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 3747 } 3748 else 3749 { 3750 short8 sa = cast(short8)a; 3751 long2 lc = cast(long2)count; 3752 int bits = cast(int)(lc.array[0]); 3753 short8 r = void; 3754 foreach(i; 0..8) 3755 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits); 3756 return cast(int4)r; 3757 } 3758 } 3759 3760 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted 3761 { 3762 static if (LDC_with_SSE2) 3763 { 3764 return __builtin_ia32_psrld128(a, count); 3765 } 3766 else static if (GDC_with_SSE2) 3767 { 3768 return __builtin_ia32_psrld128(a, count); 3769 } 3770 else 3771 { 3772 int4 r = void; 3773 long2 lc = cast(long2)count; 3774 int bits = cast(int)(lc.array[0]); 3775 r.ptr[0] = cast(uint)(a.array[0]) >> bits; 3776 r.ptr[1] = cast(uint)(a.array[1]) >> bits; 3777 r.ptr[2] = cast(uint)(a.array[2]) >> bits; 3778 r.ptr[3] = cast(uint)(a.array[3]) >> bits; 3779 return r; 3780 } 3781 } 3782 3783 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted 3784 { 3785 static if (LDC_with_SSE2) 3786 { 3787 return 
cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 3788 } 3789 else static if (GDC_with_SSE2) 3790 { 3791 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 3792 } 3793 else 3794 { 3795 long2 r = void; 3796 long2 sa = cast(long2)a; 3797 long2 lc = cast(long2)count; 3798 int bits = cast(int)(lc.array[0]); 3799 r.ptr[0] = cast(ulong)(sa.array[0]) >> bits; 3800 r.ptr[1] = cast(ulong)(sa.array[1]) >> bits; 3801 return cast(__m128i)r; 3802 } 3803 } 3804 3805 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 3806 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted 3807 { 3808 static if (GDC_with_SSE2) 3809 { 3810 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 3811 } 3812 else static if (LDC_with_SSE2) 3813 { 3814 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 3815 } 3816 else static if (LDC_with_ARM64) 3817 { 3818 short8 sa = cast(short8)a; 3819 short8 r = cast(short8) _mm_setzero_si128(); 3820 3821 ubyte count = cast(ubyte)imm8; 3822 if (count >= 16) 3823 return cast(__m128i)r; 3824 3825 r = sa >>> short8(count); // This facility offered with LDC, but not DMD. 3826 return cast(__m128i)r; 3827 } 3828 else 3829 { 3830 short8 sa = cast(short8)a; 3831 ubyte count = cast(ubyte)imm8; 3832 3833 short8 r = cast(short8) _mm_setzero_si128(); 3834 if (count >= 16) 3835 return cast(__m128i)r; 3836 3837 foreach(i; 0..8) 3838 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count); 3839 return cast(__m128i)r; 3840 } 3841 } 3842 unittest 3843 { 3844 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3845 short8 B = cast(short8)( _mm_srli_epi16(A, 1) ); 3846 short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) ); 3847 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 3848 assert(B.array == expectedB); 3849 assert(B2.array == expectedB); 3850 3851 short8 C = cast(short8)( _mm_srli_epi16(A, 16) ); 3852 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0]; 3853 assert(C.array == expectedC); 3854 3855 short8 D = cast(short8)( _mm_srli_epi16(A, 0) ); 3856 short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ]; 3857 assert(D.array == expectedD); 3858 } 3859 3860 3861 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 3862 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted 3863 { 3864 static if (GDC_with_SSE2) 3865 { 3866 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 3867 } 3868 else static if (LDC_with_SSE2) 3869 { 3870 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 3871 } 3872 else 3873 { 3874 ubyte count = cast(ubyte) imm8; 3875 3876 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3877 // D says "It's illegal to shift by the same or more bits 3878 // than the size of the quantity being shifted" 3879 // and it's UB instead. 
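        // Logical right shifts do not saturate the count: 32 or more zeroes every lane,
        // hence the early return below.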
3880 int4 r = _mm_setzero_si128(); 3881 if (count >= 32) 3882 return r; 3883 r.ptr[0] = a.array[0] >>> count; 3884 r.ptr[1] = a.array[1] >>> count; 3885 r.ptr[2] = a.array[2] >>> count; 3886 r.ptr[3] = a.array[3] >>> count; 3887 return r; 3888 } 3889 } 3890 unittest 3891 { 3892 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3893 __m128i B = _mm_srli_epi32(A, 1); 3894 __m128i B2 = _mm_srli_epi32(A, 1 + 256); 3895 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; 3896 assert(B.array == expectedB); 3897 assert(B2.array == expectedB); 3898 3899 __m128i C = _mm_srli_epi32(A, 255); 3900 int[4] expectedC = [ 0, 0, 0, 0 ]; 3901 assert(C.array == expectedC); 3902 } 3903 3904 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 3905 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted 3906 { 3907 static if (GDC_with_SSE2) 3908 { 3909 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 3910 } 3911 else static if (LDC_with_SSE2) 3912 { 3913 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 3914 } 3915 else 3916 { 3917 long2 r = cast(long2) _mm_setzero_si128(); 3918 long2 sa = cast(long2)a; 3919 3920 ubyte count = cast(ubyte) imm8; 3921 if (count >= 64) 3922 return cast(__m128i)r; 3923 3924 r.ptr[0] = sa.array[0] >>> count; 3925 r.ptr[1] = sa.array[1] >>> count; 3926 return cast(__m128i)r; 3927 } 3928 } 3929 unittest 3930 { 3931 __m128i A = _mm_setr_epi64(8, -4); 3932 long2 B = cast(long2) _mm_srli_epi64(A, 1); 3933 long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512); 3934 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; 3935 assert(B.array == expectedB); 3936 assert(B2.array == expectedB); 3937 3938 long2 C = cast(long2) _mm_srli_epi64(A, 64); 3939 long[2] expectedC = [ 0, 0 ]; 3940 assert(C.array == expectedC); 3941 } 3942 3943 /// Shift `v` right by `bytes` bytes while shifting in zeros. 3944 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe 3945 { 3946 static if (bytes & 0xF0) 3947 { 3948 return _mm_setzero_si128(); 3949 } 3950 else static if (GDC_with_SSE2) 3951 { 3952 return cast(__m128i) __builtin_ia32_psrldqi128(v, cast(ubyte)(bytes * 8)); 3953 } 3954 else static if (DMD_with_32bit_asm) 3955 { 3956 asm pure nothrow @nogc @trusted 3957 { 3958 movdqu XMM0, v; 3959 psrldq XMM0, bytes; 3960 movdqu v, XMM0; 3961 } 3962 return v; 3963 } 3964 else 3965 { 3966 return cast(__m128i) shufflevector!(byte16, 3967 bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7, 3968 bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15) 3969 (cast(byte16) v, cast(byte16)_mm_setzero_si128()); 3970 } 3971 } 3972 unittest 3973 { 3974 __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1)); 3975 int[4] correct = [2, 3, 4, 0]; 3976 assert(R.array == correct); 3977 3978 __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1)); 3979 int[4] expectedA = [0, 0, 0, 0]; 3980 assert(A.array == expectedA); 3981 } 3982 3983 /// Shift `v` right by `bytes` bytes while shifting in zeros. 3984 /// #BONUS 3985 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe 3986 { 3987 return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v); 3988 } 3989 unittest 3990 { 3991 __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)); 3992 float[4] correct = [3.0f, 4.0f, 0, 0]; 3993 assert(R.array == correct); 3994 } 3995 3996 /// Shift `v` right by `bytes` bytes while shifting in zeros. 
3997 /// #BONUS 3998 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe 3999 { 4000 return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v); 4001 } 4002 4003 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 4004 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 4005 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted 4006 { 4007 __m128d* aligned = cast(__m128d*)mem_addr; 4008 *aligned = a; 4009 } 4010 4011 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 4012 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 4013 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted 4014 { 4015 __m128d* aligned = cast(__m128d*)mem_addr; 4016 __m128d r; 4017 r.ptr[0] = a.array[0]; 4018 r.ptr[1] = a.array[0]; 4019 *aligned = r; 4020 } 4021 4022 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 4023 /// be aligned on any particular boundary. 4024 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe 4025 { 4026 *mem_addr = a.array[0]; 4027 } 4028 4029 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 4030 /// general-protection exception may be generated. 4031 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe 4032 { 4033 *mem_addr = a; 4034 } 4035 4036 alias _mm_store1_pd = _mm_store_pd1; /// 4037 4038 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory. 4039 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe 4040 { 4041 *mem_addr = a.array[1]; 4042 } 4043 4044 // Note: `mem_addr` doesn't have to actually be aligned, which breaks 4045 // expectations from the user point of view. This problem also exist in C++. 4046 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe 4047 { 4048 long* dest = cast(long*)mem_addr; 4049 long2 la = cast(long2)a; 4050 *dest = la.array[0]; 4051 } 4052 unittest 4053 { 4054 long[3] A = [1, 2, 3]; 4055 _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000)); 4056 long[3] correct = [1, 0x1_0000_0000, 3]; 4057 assert(A == correct); 4058 } 4059 4060 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. 4061 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe 4062 { 4063 *mem_addr = a.array[0]; 4064 } 4065 4066 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be 4067 /// aligned on a 16-byte boundary or a general-protection exception may be generated. 4068 void _mm_storer_pd (double* mem_addr, __m128d a) pure 4069 { 4070 __m128d* aligned = cast(__m128d*)mem_addr; 4071 *aligned = shufflevector!(double2, 1, 0)(a, a); 4072 } 4073 4074 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 4075 /// `mem_addr` does not need to be aligned on any particular boundary. 4076 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe 4077 { 4078 storeUnaligned!double2(a, mem_addr); 4079 } 4080 4081 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 4082 /// boundary. 
4083 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe 4084 { 4085 storeUnaligned!__m128i(a, cast(int*)mem_addr); 4086 } 4087 4088 /// Store 32-bit integer from the first element of `a` into memory. 4089 /// `mem_addr` does not need to be aligned on any particular boundary. 4090 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted 4091 { 4092 int* dest = cast(int*)mem_addr; 4093 *dest = a.array[0]; 4094 } 4095 unittest 4096 { 4097 int[2] arr = [-24, 12]; 4098 _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7)); 4099 assert(arr == [-24, -1]); 4100 } 4101 4102 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) 4103 /// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte 4104 /// boundary or a general-protection exception may be generated. 4105 void _mm_stream_pd (double* mem_addr, __m128d a) 4106 { 4107 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4108 __m128d* dest = cast(__m128d*)mem_addr; 4109 *dest = a; 4110 } 4111 4112 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint. 4113 /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception 4114 /// may be generated. 4115 void _mm_stream_si128 (__m128i* mem_addr, __m128i a) 4116 { 4117 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4118 __m128i* dest = cast(__m128i*)mem_addr; 4119 *dest = a; 4120 } 4121 4122 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache 4123 /// pollution. If the cache line containing address mem_addr is already in the cache, 4124 /// the cache will be updated. 4125 void _mm_stream_si32 (int* mem_addr, int a) 4126 { 4127 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4128 *mem_addr = a; 4129 } 4130 4131 /// Store 64-bit integer a into memory using a non-temporal hint to minimize 4132 /// cache pollution. If the cache line containing address mem_addr is already 4133 /// in the cache, the cache will be updated. 4134 void _mm_stream_si64 (long* mem_addr, long a) 4135 { 4136 // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4137 *mem_addr = a; 4138 } 4139 4140 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. 4141 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe 4142 { 4143 return cast(__m128i)(cast(short8)a - cast(short8)b); 4144 } 4145 4146 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. 4147 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe 4148 { 4149 return cast(__m128i)(cast(int4)a - cast(int4)b); 4150 } 4151 4152 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. 4153 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe 4154 { 4155 return cast(__m128i)(cast(long2)a - cast(long2)b); 4156 } 4157 4158 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. 4159 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe 4160 { 4161 return cast(__m128i)(cast(byte16)a - cast(byte16)b); 4162 } 4163 4164 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 4165 /// floating-point elements in `a`. 
4166 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
4167 {
4168     return a - b;
4169 }
4170
4171 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit)
4172 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
4173 /// upper element of result.
4174 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
4175 {
4176     version(DigitalMars)
4177     {
4178         // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
4179         // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
4180         asm pure nothrow @nogc @trusted { nop;}
4181         a[0] = a[0] - b[0];
4182         return a;
4183     }
4184     else static if (GDC_with_SSE2)
4185     {
4186         return __builtin_ia32_subsd(a, b);
4187     }
4188     else
4189     {
4190         a.ptr[0] -= b.array[0];
4191         return a;
4192     }
4193 }
4194 unittest
4195 {
4196     __m128d a = [1.5, -2.0];
4197     a = _mm_sub_sd(a, a);
4198     assert(a.array == [0.0, -2.0]);
4199 }
4200
4201 /// Subtract 64-bit integer `b` from 64-bit integer `a`.
4202 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
4203 {
4204     return a - b;
4205 }
4206
4207 /// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed integers in `a` using signed saturation.
4208 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
4209 {
4210     version(LDC)
4211     {
4212         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4213         {
4214             // Generates PSUBSW since LDC 1.15 -O0
4217             enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4218             enum ir = `
4219                 %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4220                 ret <8 x i16> %r`;
4221             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4222         }
4223         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
4224         {
4226             short[8] res;
4227             short8 sa = cast(short8)a;
4228             short8 sb = cast(short8)b;
4229             foreach(i; 0..8)
4230                 res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4231             return _mm_loadu_si128(cast(int4*)res.ptr);
4232         }
4233         else static if (LDC_with_SSE2)
4234         {
4235             return __builtin_ia32_psubsw128(a, b);
4236         }
4237         else
4238             static assert(false);
4239     }
4240     else static if (GDC_with_SSE2)
4241     {
4242         return __builtin_ia32_psubsw128(a, b);
4243     }
4244     else
4245     {
4246         short[8] res;
4247         short8 sa = cast(short8)a;
4248         short8 sb = cast(short8)b;
4249         foreach(i; 0..8)
4250             res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
4251         return _mm_loadu_si128(cast(int4*)res.ptr);
4252     }
4253 }
4254 unittest
4255 {
4256     short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
4257                                              _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
4258     static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
4259     assert(res.array == correctResult);
4260 }
4261
4262 /// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed integers in `a` using signed saturation.
4263 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
4264 {
4265     version(LDC)
4266     {
4267         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4268         {
4269             // x86: Generates PSUBSB since LDC 1.15 -O0
4270             // ARM: Generates sqsub.16b since LDC 1.21 -O0
4271             enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4272             enum ir = `
4273                 %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4274                 ret <16 x i8> %r`;
4275             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4276         }
4277         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
4278         {
4279             byte[16] res;
4280             byte16 sa = cast(byte16)a;
4281             byte16 sb = cast(byte16)b;
4282             foreach(i; 0..16)
4283                 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4284             return _mm_loadu_si128(cast(int4*)res.ptr);
4285         }
4286         else static if (LDC_with_SSE2)
4287         {
4288             return __builtin_ia32_psubsb128(a, b);
4289         }
4290         else
4291             static assert(false);
4292     }
4293     else static if (GDC_with_SSE2)
4294     {
4295         return __builtin_ia32_psubsb128(a, b);
4296     }
4297     else
4298     {
4299         byte[16] res;
4300         byte16 sa = cast(byte16)a;
4301         byte16 sb = cast(byte16)b;
4302         foreach(i; 0..16)
4303             res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
4304         return _mm_loadu_si128(cast(int4*)res.ptr);
4305     }
4306 }
4307 unittest
4308 {
4309     byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4310                                             _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4311     static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4312     assert(res.array == correctResult);
4313 }
4314
4315 /// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned integers in `a` using unsigned saturation.
4316 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
4317 {
4318     version(LDC)
4319     {
4320         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4321         {
4322             // x86: Generates PSUBUSW since LDC 1.15 -O0
4323             // ARM: Generates uqsub.8h since LDC 1.21 -O0
4324             enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
4325             enum ir = `
4326                 %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
4327                 ret <8 x i16> %r`;
4328             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
4329         }
4330         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
4331         {
4332             short[8] res;
4333             short8 sa = cast(short8)a;
4334             short8 sb = cast(short8)b;
4335             foreach(i; 0..8)
4336             {
4337                 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4338                 res[i] = saturateSignedIntToUnsignedShort(sum);
4339             }
4340             return _mm_loadu_si128(cast(int4*)res.ptr);
4341         }
4342         else static if (LDC_with_SSE2)
4343         {
4344             return __builtin_ia32_psubusw128(a, b);
4345         }
4346         else
4347             static assert(false);
4348     }
4349     else static if (GDC_with_SSE2)
4350     {
4351         return __builtin_ia32_psubusw128(a, b);
4352     }
4353     else
4354     {
4355         short[8] res;
4356         short8 sa = cast(short8)a;
4357         short8 sb = cast(short8)b;
4358         foreach(i; 0..8)
4359         {
4360             int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
4361             res[i] = saturateSignedIntToUnsignedShort(sum);
4362         }
4363         return _mm_loadu_si128(cast(int4*)res.ptr);
4364     }
4365 }
4366 unittest
4367 {
4368     short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534, 1, 5, 4, 3, 2, 1, 0),
4369                                            _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
4370     static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0];
4371     assert(R.array == correct);
4372 }
4373
4374 /// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned integers in `a` using unsigned saturation.
4375 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
4376 {
4377     version(LDC)
4378     {
4379         static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
4380         {
4381             // x86: Generates PSUBUSB since LDC 1.15 -O0
4382             // ARM: Generates uqsub.16b since LDC 1.21 -O0
4383             enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
4384             enum ir = `
4385                 %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
4386                 ret <16 x i8> %r`;
4387             return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
4388         }
4389         else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
4390         {
4394             ubyte[16] res;
4395             byte16 sa = cast(byte16)a;
4396             byte16 sb = cast(byte16)b;
4397             foreach(i; 0..16)
4398                 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
4399             return _mm_loadu_si128(cast(int4*)res.ptr);
4401         }
4402         else static if (LDC_with_SSE2)
4403         {
4404             return __builtin_ia32_psubusb128(a, b);
4405         }
4406         else
4407             static assert(false);
4408     }
4409     else static if (GDC_with_SSE2)
4410     {
4411         return __builtin_ia32_psubusb128(a, b);
4412     }
4413     else
4414     {
4415         ubyte[16] res;
4416         byte16 sa = cast(byte16)a;
4417         byte16 sb = cast(byte16)b;
4418         foreach(i; 0..16)
4419             res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
4420         return _mm_loadu_si128(cast(int4*)res.ptr);
4421     }
4422 }
4423 unittest
4424 {
4425     byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
4426                                             _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
4427     static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
4428     assert(res.array == correctResult);
4429 }
4430
4431 // Note: the only difference between these intrinsics and their `comi` counterparts is the
4432 // signalling behaviour on quiet NaNs. Aliasing them is not strictly correct, but the case
4433 // where you would want to distinguish qNaN from sNaN and treat them differently on purpose
4434 // seems extremely rare.
4435 alias _mm_ucomieq_sd = _mm_comieq_sd; ///
4436 alias _mm_ucomige_sd = _mm_comige_sd; ///
4437 alias _mm_ucomigt_sd = _mm_comigt_sd; ///
4438 alias _mm_ucomile_sd = _mm_comile_sd; ///
4439 alias _mm_ucomilt_sd = _mm_comilt_sd; ///
4440 alias _mm_ucomineq_sd = _mm_comineq_sd; ///
4441
4442 /// Return vector of type `__m128d` with undefined elements.
4443 __m128d _mm_undefined_pd() pure @safe
4444 {
4445     __m128d result = void;
4446     return result;
4447 }
4448
4449 /// Return vector of type `__m128i` with undefined elements.
4450 __m128i _mm_undefined_si128() pure @safe
4451 {
4452     __m128i result = void;
4453     return result;
4454 }
4455
4456 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
4457 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
4458 {
4459     static if (GDC_with_SSE2)
4460     {
4461         return __builtin_ia32_punpckhwd128(a, b);
4462     }
4463     else static if (DMD_with_32bit_asm)
4464     {
4465         asm pure nothrow @nogc @trusted
4466         {
4467             movdqu XMM0, a;
4468             movdqu XMM1, b;
4469             punpckhwd XMM0, XMM1;
4470             movdqu a, XMM0;
4471         }
4472         return a;
4473     }
4474     else
4475     {
4476         return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
4477                                            (cast(short8)a, cast(short8)b);
4478     }
4479 }
4480 unittest
4481 {
4482     __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11);
4483     __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
4484     short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
4485     short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
4486     assert(C.array == correct);
4487 }
4488
4489 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
4490 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
4491 {
4492     static if (GDC_with_SSE2)
4493     {
4494         return __builtin_ia32_punpckhdq128(a, b);
4495     }
4496     else
4497     {
4498         return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
4499     }
4500 }
4501 // TODO unittest
4502
4503 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
4504 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted 4505 { 4506 static if (GDC_with_SSE2) 4507 { 4508 return __builtin_ia32_punpckhqdq128(a, b); 4509 } 4510 else 4511 { 4512 __m128i r = cast(__m128i)b; 4513 r[0] = a[2]; 4514 r[1] = a[3]; 4515 return r; 4516 } 4517 } 4518 unittest // Issue #36 4519 { 4520 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4521 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4522 long2 C = cast(long2)(_mm_unpackhi_epi64(A, B)); 4523 long[2] correct = [0x33333333_33333333, 0x55555555_55555555]; 4524 assert(C.array == correct); 4525 } 4526 4527 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. 4528 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe 4529 { 4530 static if (GDC_with_SSE2) 4531 { 4532 return __builtin_ia32_punpckhbw128(a, b); 4533 } 4534 else static if (DMD_with_32bit_asm) 4535 { 4536 asm pure nothrow @nogc @trusted 4537 { 4538 movdqu XMM0, a; 4539 movdqu XMM1, b; 4540 punpckhbw XMM0, XMM1; 4541 movdqu a, XMM0; 4542 } 4543 return a; 4544 } 4545 else 4546 { 4547 return cast(__m128i)shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27, 4548 12, 28, 13, 29, 14, 30, 15, 31) 4549 (cast(byte16)a, cast(byte16)b); 4550 } 4551 } 4552 // TODO unittest 4553 4554 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`. 4555 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe 4556 { 4557 static if (GDC_with_SSE2) 4558 { 4559 return __builtin_ia32_unpckhpd(a, b); 4560 } 4561 else 4562 { 4563 return shufflevector!(__m128d, 1, 3)(a, b); 4564 } 4565 } 4566 // TODO unittest 4567 4568 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. 4569 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe 4570 { 4571 static if (GDC_with_SSE2) 4572 { 4573 return __builtin_ia32_punpcklwd128(a, b); 4574 } 4575 else static if (DMD_with_32bit_asm) 4576 { 4577 asm pure nothrow @nogc @trusted 4578 { 4579 movdqu XMM0, a; 4580 movdqu XMM1, b; 4581 punpcklwd XMM0, XMM1; 4582 movdqu a, XMM0; 4583 } 4584 return a; 4585 } 4586 else 4587 { 4588 return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11) 4589 (cast(short8)a, cast(short8)b); 4590 } 4591 } 4592 // TODO unittest 4593 4594 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. 4595 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe 4596 { 4597 static if (GDC_with_SSE2) 4598 { 4599 return __builtin_ia32_punpckldq128(a, b); 4600 } 4601 else 4602 { 4603 return shufflevector!(int4, 0, 4, 1, 5) 4604 (cast(int4)a, cast(int4)b); 4605 } 4606 } 4607 // TODO unittest 4608 4609 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. 
4610 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted 4611 { 4612 static if (GDC_with_SSE2) 4613 { 4614 return __builtin_ia32_punpcklqdq128(a, b); 4615 } 4616 else 4617 { 4618 long2 lA = cast(long2)a; 4619 long2 lB = cast(long2)b; 4620 long2 R; 4621 R.ptr[0] = lA.array[0]; 4622 R.ptr[1] = lB.array[0]; 4623 return cast(__m128i)R; 4624 } 4625 } 4626 unittest // Issue #36 4627 { 4628 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4629 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4630 long2 C = cast(long2)(_mm_unpacklo_epi64(A, B)); 4631 long[2] correct = [0x22222222_22222222, 0x44444444_44444444]; 4632 assert(C.array == correct); 4633 } 4634 4635 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. 4636 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe 4637 { 4638 static if (GDC_with_SSE2) 4639 { 4640 return __builtin_ia32_punpcklbw128(a, b); 4641 } 4642 else static if (DMD_with_32bit_asm) 4643 { 4644 asm pure nothrow @nogc @trusted 4645 { 4646 movdqu XMM0, a; 4647 movdqu XMM1, b; 4648 punpcklbw XMM0, XMM1; 4649 movdqu a, XMM0; 4650 } 4651 return a; 4652 } 4653 else 4654 { 4655 return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19, 4656 4, 20, 5, 21, 6, 22, 7, 23) 4657 (cast(byte16)a, cast(byte16)b); 4658 } 4659 } 4660 // TODO unittest 4661 4662 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`. 4663 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe 4664 { 4665 static if (GDC_with_SSE2) 4666 { 4667 return __builtin_ia32_unpcklpd(a, b); 4668 } 4669 else 4670 { 4671 return shufflevector!(__m128d, 0, 2)(a, b); 4672 } 4673 } 4674 // TODO unittest 4675 4676 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 4677 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe 4678 { 4679 return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b); 4680 } 4681 4682 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`. 4683 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe 4684 { 4685 return a ^ b; 4686 } 4687 4688 unittest 4689 { 4690 float distance(float[4] a, float[4] b) nothrow @nogc 4691 { 4692 __m128 va = _mm_loadu_ps(a.ptr); 4693 __m128 vb = _mm_loadu_ps(b.ptr); 4694 __m128 diffSquared = _mm_sub_ps(va, vb); 4695 diffSquared = _mm_mul_ps(diffSquared, diffSquared); 4696 __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared)); 4697 sum = _mm_add_ps(sum, _mm_srli_ps!4(sum)); 4698 return _mm_cvtss_f32(_mm_sqrt_ss(sum)); 4699 } 4700 assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2); 4701 }
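4702
4703 // Example checks for the unpack/interleave intrinsics above that are still marked "TODO unittest".
4704 // Expected values follow the Intel pseudocode for PUNPCKHDQ / PUNPCKLWD / PUNPCKLBW / UNPCKHPD / UNPCKLPD;
4705 // these are minimal sketches rather than exhaustive tests.
4706 unittest // _mm_unpackhi_epi32
4707 {
4708     __m128i A = _mm_setr_epi32(0, 1, 2, 3);
4709     __m128i B = _mm_setr_epi32(4, 5, 6, 7);
4710     int4 R = cast(int4) _mm_unpackhi_epi32(A, B);
4711     int[4] correct = [2, 6, 3, 7]; // high halves of A and B, interleaved
4712     assert(R.array == correct);
4713 }
4714 unittest // _mm_unpacklo_epi16
4715 {
4716     __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4717     __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4718     short8 R = cast(short8) _mm_unpacklo_epi16(A, B);
4719     short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11]; // low halves of A and B, interleaved
4720     assert(R.array == correct);
4721 }
4722 unittest // _mm_unpacklo_epi8
4723 {
4724     __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4725     __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
4726     byte16 R = cast(byte16) _mm_unpacklo_epi8(A, B);
4727     byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
4728     assert(R.array == correct);
4729 }
4730 unittest // _mm_unpackhi_pd and _mm_unpacklo_pd
4731 {
4732     __m128d A = _mm_setr_pd(4.0, 6.0);
4733     __m128d B = _mm_setr_pd(7.0, 9.0);
4734     __m128d H = _mm_unpackhi_pd(A, B);
4735     __m128d L = _mm_unpacklo_pd(A, B);
4736     double[2] correctH = [6.0, 9.0]; // upper elements of A and B
4737     double[2] correctL = [4.0, 7.0]; // lower elements of A and B
4738     assert(H.array == correctH);
4739     assert(L.array == correctL);
4740 }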