/**
* Copyright: Copyright Auburn Sounds 2016-2019, Stefanos Baziotis 2019.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors: Guillaume Piolat
*/
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1
import inteli.mmx;
import inteli.internals;

nothrow @nogc:


// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a + cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 R = cast(short8) _mm_add_epi16(A, A);
    short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a + cast(int4)b);
}
unittest
{
    __m128i A = _mm_setr_epi32(-7, -1, 0, 9);
    int4 R = _mm_add_epi32(A, A);
    int[4] correct = [-14, -2, 0, 18];
    assert(R.array == correct);
}

/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a + cast(long2)b);
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 R = cast(long2) _mm_add_epi64(A, A);
    long[2] correct = [-2, 0];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a + cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 R = cast(byte16) _mm_add_epi8(A, A);
    byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(R.array == correct);
}

/// Add the lower double-precision (64-bit) floating-point element
/// in `a` and `b`, store the result in the lower element of dst,
/// and copy the upper element from `a` to the upper element of destination.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        asm pure nothrow @nogc @trusted { nop; }
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}

/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    return a + b;
}
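
// Additional sanity check, not part of the original test suite:
// `_mm_add_si64` is a plain wrapping 64-bit addition on `__m64`.
unittest
{
    __m64 A = long1(-1);
    __m64 B = long1(100);
    long1 R = _mm_add_si64(A, B);
    assert(R.array[0] == 99);
}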

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_paddsw128(a, b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PADDSW since LDC 1.15 -O0
            enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else
            return __builtin_ia32_paddsw128(a, b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_paddsb128(a, b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PADDSB since LDC 1.15 -O0
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else
            return __builtin_ia32_paddsb128(a, b);
    }
    else
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}
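
// Additional check, not part of the original test suite: the signed
// saturation actually clamps at byte.max.
unittest
{
    byte16 R = cast(byte16) _mm_adds_epi8(_mm_set1_epi8(100), _mm_set1_epi8(100));
    foreach(i; 0..16)
        assert(R.array[i] == 127); // 100 + 100 = 200 saturates to 127
}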

/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PADDUSB since LDC 1.15 -O0
            enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else
            return __builtin_ia32_paddusb128(a, b);
    }
    else
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                            _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14,
                                               0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PADDUSW since LDC 1.15 -O0
            enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else
            return __builtin_ia32_paddusw128(a, b);
    }
    else
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(long2)a & cast(long2)b );
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m128d A = _mm_set_pd(a, b);
    __m128d B = _mm_set_pd(b, a);
    long2 R = cast(long2)( _mm_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    return a & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_and_si128(A, B);
    int[4] correct = [6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of packed double-precision (64-bit)
/// floating-point elements in `a` and then AND with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
    long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
    __m128d A = _mm_setr_pd(a, b);
    __m128d B = _mm_setr_pd(b, a);
    long2 R = cast(long2)( _mm_andnot_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct2);
}

/// Compute the bitwise NOT of 128 bits (representing integer data)
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_andnot_si128(A, B);
    int[4] correct = [8, 8, 8, 8];
    assert(R.array == correct);
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pavgw128(a, b);
    }
    else version(LDC)
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48);
}
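
// Additional check, not part of the original test suite: the operands are
// treated as unsigned, so -2 reads as 65534 and the average stays positive.
unittest
{
    short8 R = cast(short8) _mm_avg_epu16(_mm_set1_epi16(-2), _mm_set1_epi16(0));
    foreach(i; 0..8)
        assert(R.array[i] == 32767); // (65534 + 0 + 1) >> 1
}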

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pavgb128(a, b);
    }
    else version(LDC)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr.ptr[i] = cast(ubyte)( (cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48);
}

/// Shift `a` left by `bytes` bytes while shifting in zeros.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Cast vector of type `__m128d` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Cast vector of type `__m128d` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128i` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128i` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}
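
// Additional sanity check, not part of the original test suite: the casts
// only reinterpret bits, so a round-trip returns the original value.
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_castps_si128(_mm_castsi128_ps(A));
    int[4] correct = [1, 2, 3, 4];
    assert(B.array == correct);
}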

/// Invalidate and flush the cache line that contains `p`
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else version(LDC)
    {
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else
    {
        // Do nothing. Invalidating the cache line does
        // not affect correctness.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqw128(a, b);
    }
    else
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0];
    short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4 A = [-3, -2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, -1];
    int4 R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqb128(a, b);
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}
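
// Additional sanity check, not part of the original test suite: equal lower
// lanes give an all-ones mask; the upper lane comes unchanged from `a`.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpeq_sd(A, B);
    assert(R.array[0] == -1);
    assert(R.array[1] == 0x4000000000000000); // bit pattern of 2.0
}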

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgesd builtin.
    static if (GDC_with_SSE2)
    {
        // (b <= a) is (a >= b), with the same NaN behaviour; GCC's own
        // emmintrin.h uses the same swapped cmplesd for this intrinsic.
        return __builtin_ia32_cmplesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtw128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b));
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1];
    short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b));
    }
}
unittest
{
    int4 A = [-3, 2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, 0];
    int4 R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtb128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct = [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        // (b < a) is (a > b); GCC's own emmintrin.h uses the same
        // swapped cmpltsd for this intrinsic.
        return __builtin_ia32_cmpltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}
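
// Additional sanity check, not part of the original test suite; the upper
// lanes are kept equal so the result does not depend on the code path taken.
unittest
{
    __m128d A = _mm_setr_pd(3.0, 1.0);
    __m128d B = _mm_setr_pd(2.0, 1.0);
    long2 R = cast(long2) _mm_cmpgt_sd(A, B);
    assert(R.array[0] == -1); // 3.0 > 2.0
    assert(R.array[1] == 0x3FF0000000000000); // bit pattern of 1.0
}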

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed 32-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}
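
// Additional check, not part of the original test suite: "not-greater-or-equal"
// is an unordered comparison, so a NaN lane compares true.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(2.0, 2.0);
    long2 R = cast(long2) _mm_cmpnge_pd(A, B);
    assert(R.array[0] == -1); // 1.0 >= 2.0 is false
    assert(R.array[1] == -1); // NaN lane yields true
}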

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal, store the result in
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        // !(b <= a) is !(a >= b), with unordered operands yielding true.
        return __builtin_ia32_cmpnlesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        // !(b < a) is !(a > b), with unordered operands yielding true.
        return __builtin_ia32_cmpnltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}
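
// Additional sanity check, not part of the original test suite.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(2.0, 2.0);
    long2 R = cast(long2) _mm_cmpord_pd(A, B);
    assert(R.array[0] == -1); // both lanes ordered
    assert(R.array[1] == 0);  // NaN present
}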

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN, store the result in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN, store the result in the lower
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}


// Note: we've reverted clang and GCC behaviour with regards to EFLAGS.
// Some such comparisons yield true for NaNs, others don't.

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comieq(a, b);
    }
    else
    {
        return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than-or-equal, and return the boolean
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comige(a, b);
    }
    else
    {
        return comsd!(FPComparison.oge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comigt(a, b);
    }
    else
    {
        return comsd!(FPComparison.ogt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comile(a, b);
    }
    else
    {
        return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comilt(a, b);
    }
    else
    {
        return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC
    }
}
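
// Additional sanity check, not part of the original test suite
// (ordered inputs only, so both code paths agree).
unittest
{
    assert(1 == _mm_comilt_sd(_mm_setr_pd(1.0, 0.0), _mm_setr_pd(2.0, 0.0)));
    assert(0 == _mm_comilt_sd(_mm_setr_pd(2.0, 0.0), _mm_setr_pd(1.0, 0.0)));
}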

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comineq(a, b);
    }
    else
    {
        return comsd!(FPComparison.one)(a, b);
    }
}

/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else
    {
        // Generates cvtdq2ps since LDC 1.0.0 -O1
        __m128 res;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtpd_epi32 (__m128d a) pure @trusted
{
    version(LDC)
    {
        // Like in clang, implemented with a magic intrinsic right now
        return __builtin_ia32_cvtpd2dq(a);

        /* Unfortunately this generates a cvttpd2dq instruction
        __m128i _mm_cvtpd_epi32 (__m128d a) pure @safe
        {
            enum ir = `
                %i = fptosi <2 x double> %0 to <2 x i32>
                %r = shufflevector <2 x i32> %i,<2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
                ret <4 x i32> %r`;
            return cast(__m128i) inlineIR!(ir, __m128i, __m128d)(a);
        } */
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else
    {
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
__m64 _mm_cvtpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    version(LDC)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    {
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers, using the current rounding mode.
__m128i _mm_cvtps_epi32 (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Disabled, since it fails with optimizations unfortunately
        //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
        return __asm!__m128i("cvtps2dq $1,$0","=x,x",a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else
    {
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
int _mm_cvtsd_si32 (__m128d a) pure @safe
{
    version(LDC)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 64-bit integer.
version(LDC)
{
    // Unfortunately this builtin crashes in 32-bit
    version(X86_64)
        alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64;
    else
    {
        long _mm_cvtsd_si64 (__m128d a) pure @safe
        {
            return convertDoubleToInt64UsingMXCSR(a.array[0]);
        }
    }
}
else
{
    long _mm_cvtsd_si64 (__m128d a) pure @safe
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.5)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

alias _mm_cvtsd_si64x = _mm_cvtsd_si64;

/// Convert the lower double-precision (64-bit) floating-point element in `b`
/// to a single-precision (32-bit) floating-point element, store it in the
/// lower element of result, and copy the upper 3 elements from `a`.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b);
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a[0] = b[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Copy the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}

/// Copy the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la.array[0];
}
alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;

/// Convert the signed 32-bit integer `x` to a double-precision (64-bit)
/// floating-point element, store it in the lower element of result,
/// and copy the upper element from `v`.
__m128d _mm_cvtsi32_sd(__m128d v, int x) pure @trusted
{
    v.ptr[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 r = [0, 0, 0, 0];
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}


// Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy
/// Convert the signed 64-bit integer `x` to a double-precision (64-bit)
/// floating-point element, store it in the lower element of result,
/// and copy the upper element from `v`.
__m128d _mm_cvtsi64_sd(__m128d v, long x) pure @trusted
{
    v.ptr[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}
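
// Additional sanity check, not part of the original test suite.
unittest
{
    long2 R = cast(long2) _mm_cvtsi64_si128(-42);
    long[2] correct = [-42, 0];
    assert(R.array == correct);
}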

alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd;
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;

/// Convert the lower single-precision (32-bit) floating-point element in `x`
/// to a double-precision (64-bit) floating-point element, store it in the
/// lower element of result, and copy the upper element from `v`.
double2 _mm_cvtss_sd(double2 v, float4 x) pure @trusted
{
    v.ptr[0] = x.array[0];
    return v;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

/// Convert the lower single-precision (32-bit) floating-point element
/// in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed 32-bit integers with truncation.
version(LDC)
{
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
    }
    else
    {
        __m128i _mm_cvttpd_epi32 (__m128d a) pure @safe
        {
            // Note: doesn't generate cvttpd2dq as of LDC 1.13
            __m128i r;
            r.array[0] = cast(int)a.array[0];
            r.array[1] = cast(int)a.array[1];
            r.array[2] = 0;
            r.array[3] = 0;
            return r;
        }
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}


/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // Note: Generates cvttps2dq since LDC 1.3 -O2
    __m128i r;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resorts to the FPU
    return cast(long)a.array[0];
}

alias _mm_cvttsd_si64x = _mm_cvttsd_si64;

/// Divide packed double-precision (64-bit) floating-point elements
/// in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}

/// Divide the lower double-precision (64-bit) floating-point element in `a`
/// by the lower element in `b`, store the result in the lower element,
/// and copy the upper element from `a`.
static if (GDC_with_SSE2)
{
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
    {
        return __builtin_ia32_divsd(a, b);
    }
}
else version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop; }
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
}
else
{
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        a.array[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
/// The result is zero-extended.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

/// Perform a serializing operation on all load-from-memory instructions
/// that were issued prior to this instruction.
version(GNU)
{
    void _mm_lfence() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else version(LDC)
{
    alias _mm_lfence = __builtin_ia32_lfence;
}
else static if (DMD_with_asm)
{
    void _mm_lfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
}
else
    static assert(false);
unittest
{
    _mm_lfence();
}


/// Load 128 bits (composed of 2 packed double-precision (64-bit) floating-point
/// elements) from memory. `mem_addr` must be aligned on a 16-byte boundary.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}

/// Load a double-precision (64-bit) floating-point element from memory
/// into both elements of result.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] arr = [*mem_addr, *mem_addr];
    return loadUnaligned!(double2)(&arr[0]);
}

/// Load a double-precision (64-bit) floating-point element from memory
/// into the lower element of result, and zero the upper element.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Load 128 bits of integer data from memory.
/// `mem_addr` must be aligned on a 16-byte boundary.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return *mem_addr;
}

/// Load a double-precision (64-bit) floating-point element from memory
/// into both elements of result.
alias _mm_load1_pd = _mm_load_pd1;

/// Load a double-precision (64-bit) floating-point element from memory
/// into the upper element of result, and copy the lower element from `a`.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    a.ptr[1] = *mem_addr;
    return a;
}

/// Load a 64-bit integer from memory into the lower element of result,
/// and zero the upper element.
// Note: strange signature since the memory doesn't have to be aligned
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted
{
    auto pLong = cast(const(long)*)mem_addr;
    long2 r = [0, 0];
    r.ptr[0] = *pLong;
    return cast(__m128i)(r);
}

/// Load a double-precision (64-bit) floating-point element from memory
/// into the lower element of result, and copy the upper element from `a`.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    a.ptr[0] = *mem_addr;
    return a;
}

/// Load 2 double-precision (64-bit) floating-point elements from memory into
/// result in reverse order. `mem_addr` must be aligned on a 16-byte boundary.
__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    __m128d a = *cast(__m128d*)(mem_addr);
    __m128d r;
    r.ptr[0] = a.array[1];
    r.ptr[1] = a.array[0];
    return r;
}

/// Load 128 bits (composed of 2 packed double-precision (64-bit) floating-point
/// elements) from memory. `mem_addr` does not need to be aligned.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loadupd(mem_addr);
    }
    else
    {
        return loadUnaligned!(double2)(mem_addr);
    }
}

/// Load 128 bits of integer data from memory. `mem_addr` does not need to be aligned.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
    }
    else
    {
        return loadUnaligned!(__m128i)(cast(int*)mem_addr);
    }
}
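
// Additional sanity check, not part of the original test suite.
unittest
{
    int[4] arr = [-42, 99, -1, 0];
    __m128i A = _mm_loadu_si128(cast(__m128i*) arr.ptr);
    assert(A.array == arr);
}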

/// Load a 32-bit integer from memory into the lower element of result,
/// and zero the upper elements. `mem_addr` does not need to be aligned.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
{
    int r = *cast(int*)(mem_addr);
    int4 result = [0, 0, 0, 0];
    result.ptr[0] = r;
    return result;
}
unittest
{
    int r = 42;
    __m128i A = _mm_loadu_si32(&r);
    int[4] correct = [42, 0, 0, 0];
    assert(A.array == correct);
}

static if (GDC_with_SSE2)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;
}
else version(LDC)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;
}
else
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;

        int4 r;
        foreach(i; 0..4)
        {
            r.array[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return r;
    }
}
unittest
{
    short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
    int[4] correct = [1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}

version(LDC)
{
    /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
    /// (elements are not stored when the highest bit is not set in the corresponding element)
    /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
    /// boundary.
    alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu; // can't do it with pure IR
}
else
{
    static if (GDC_with_SSE2)
    {
        ///ditto
        void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted
        {
            return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
        }
    }
    else
    {
        ///ditto
        void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted
        {
            byte16 b = cast(byte16)a;
            byte16 m = cast(byte16)mask;
            byte* dest = cast(byte*)(mem_addr);
            foreach(j; 0..16)
            {
                if (m.array[j] & 128)
                {
                    dest[j] = b.array[j];
                }
            }
        }
    }
}
unittest
{
    ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
    __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
    assert(dest == correct);
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lowerShorts);
    return _mm_xor_si128(b, mask);
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-57),
                                          _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0));
    short[8] correct = [45, 1, 9, 7, 9, 7, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    __m128i value128 = _mm_set1_epi8(-128);
    __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, higher);
    return _mm_xor_si128(b, mask);
}
unittest
{
    byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
                                         _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
    byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and return packed maximum values.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxpd(a, b);
    }
    else
    {
        // Generates maxpd starting with LDC 1.9
        a[0] = (a[0] > b[0]) ? a[0] : b[0];
        a[1] = (a[1] > b[1]) ? a[1] : b[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 8.0);
}

/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`,
/// store the maximum value in the lower element, and copy the upper element from `a`.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxsd(a, b);
    }
    else
    {
        __m128d r = a;
        // Generates maxsd starting with LDC 1.3
        r.array[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 1.0);
}

/// Perform a serializing operation on all load-from-memory and
/// store-to-memory instructions that were issued prior to this instruction.
version(GNU)
{
    void _mm_mfence() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else version(LDC)
{
    alias _mm_mfence = __builtin_ia32_mfence;
}
else static if (DMD_with_asm)
{
    void _mm_mfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
}
else
    static assert(false);
unittest
{
    _mm_mfence();
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    // Note: clang uses a __builtin_ia32_pminsw128 which has disappeared from LDC LLVM (?)
    // Implemented using masks and XOR
    __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lowerShorts);
    return _mm_xor_si128(b, mask);
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-57),
                                          _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0));
    short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -57];
    assert(R.array == correct);
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    __m128i value128 = _mm_set1_epi8(-128);
    __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lower);
    return _mm_xor_si128(b, mask);
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
                                         _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
    byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0];
    assert(R.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and return packed minimum values.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minpd(a, b);
    }
    else
    {
        // Generates minpd starting with LDC 1.9
        a.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.array[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 1.0);
    __m128d M = _mm_min_pd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 1.0);
}

/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`,
/// store the minimum value in the lower element, and copy the upper element from `a`.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minsd(a, b);
    }
    else
    {
        // Generates minsd starting with LDC 1.3
        __m128d r = a;
        r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
a.array[0] : b.array[0]; 2057 return r; 2058 } 2059 } 2060 unittest 2061 { 2062 __m128d A = _mm_setr_pd(1.0, 3.0); 2063 __m128d B = _mm_setr_pd(4.0, 2.0); 2064 __m128d M = _mm_min_sd(A, B); 2065 assert(M.array[0] == 1.0); 2066 assert(M.array[1] == 3.0); 2067 } 2068 2069 __m128i _mm_move_epi64 (__m128i a) pure @safe 2070 { 2071 static if (GDC_with_SSE2) 2072 { 2073 return __builtin_ia32_movq128(a); 2074 } 2075 else 2076 { 2077 long2 result = [ 0, 0 ]; 2078 long2 la = cast(long2) a; 2079 result.array[0] = la.array[0]; 2080 return cast(__m128i)(result); 2081 } 2082 } 2083 unittest 2084 { 2085 long2 A = [13, 47]; 2086 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); 2087 long[2] correct = [13, 0]; 2088 assert(B.array == correct); 2089 } 2090 2091 __m128d _mm_move_sd (__m128d a, __m128d b) pure @safe 2092 { 2093 static if (GDC_with_SSE2) 2094 { 2095 return __builtin_ia32_movsd(a, b); 2096 } 2097 else 2098 { 2099 b.array[1] = a.array[1]; 2100 return b; 2101 } 2102 } 2103 unittest 2104 { 2105 double2 A = [13.0, 47.0]; 2106 double2 B = [34.0, 58.0]; 2107 double2 C = _mm_move_sd(A, B); 2108 double[2] correct = [34.0, 47.0]; 2109 assert(C.array == correct); 2110 } 2111 2112 version(LDC) 2113 { 2114 /// Create mask from the most significant bit of each 8-bit element in `v`. 2115 alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128; 2116 } 2117 else 2118 { 2119 static if (GDC_with_SSE2) 2120 { 2121 /// Create mask from the most significant bit of each 8-bit element in `v`. 2122 alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128; 2123 } 2124 else 2125 { 2126 /// Create mask from the most significant bit of each 8-bit element in `v`. 2127 int _mm_movemask_epi8(__m128i v) pure @safe 2128 { 2129 byte16 ai = cast(byte16)v; 2130 int r = 0; 2131 foreach(bit; 0..16) 2132 { 2133 if (ai.array[bit] < 0) r += (1 << bit); 2134 } 2135 return r; 2136 } 2137 } 2138 } 2139 unittest 2140 { 2141 assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 0, 0, -1, -1, -1, 0, 0, 0, 0, -1, -1, 0, -1, -1, 0))); 2142 } 2143 2144 version(LDC) 2145 { 2146 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2147 /// packed double-precision (64-bit) floating-point element in `v`. 2148 alias _mm_movemask_pd = __builtin_ia32_movmskpd; 2149 } 2150 else 2151 { 2152 static if (GDC_with_SSE2) 2153 { 2154 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2155 /// packed double-precision (64-bit) floating-point element in `v`. 2156 alias _mm_movemask_pd = __builtin_ia32_movmskpd; 2157 } 2158 else 2159 { 2160 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2161 /// packed double-precision (64-bit) floating-point element in `v`. 2162 int _mm_movemask_pd(__m128d v) pure @safe 2163 { 2164 long2 lv = cast(long2)v; 2165 int r = 0; 2166 if (lv.array[0] < 0) r += 1; 2167 if (lv.array[1] < 0) r += 2; 2168 return r; 2169 } 2170 } 2171 } 2172 unittest 2173 { 2174 __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0); 2175 assert(_mm_movemask_pd(A) == 2); 2176 } 2177 2178 /// Copy the lower 64-bit integer in `v`. 2179 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe 2180 { 2181 long2 lv = cast(long2)v; 2182 return long1(lv.array[0]); 2183 } 2184 unittest 2185 { 2186 __m128i A = _mm_set_epi64x(-1, -2); 2187 __m64 R = _mm_movepi64_pi64(A); 2188 assert(R.array[0] == -2); 2189 } 2190 2191 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element. 
2192 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted 2193 { 2194 long2 r; 2195 r.ptr[0] = a.array[0]; 2196 r.ptr[1] = 0; 2197 return cast(__m128i)r; 2198 } 2199 2200 // Note: generates pmuludq in LDC with -O1 2201 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted 2202 { 2203 __m128i zero = _mm_setzero_si128(); 2204 2205 static if (__VERSION__ >= 2088) 2206 { 2207 // Need LLVM9 to avoid this shufflevector 2208 long2 la, lb; 2209 la.ptr[0] = cast(uint)a.array[0]; 2210 la.ptr[1] = cast(uint)a.array[2]; 2211 lb.ptr[0] = cast(uint)b.array[0]; 2212 lb.ptr[1] = cast(uint)b.array[2]; 2213 } 2214 else 2215 { 2216 long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero); 2217 long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero); 2218 } 2219 2220 static if (__VERSION__ >= 2076) 2221 { 2222 return cast(__m128i)(la * lb); 2223 } 2224 else 2225 { 2226 // long2 mul not supported before LDC 1.5 2227 la.ptr[0] *= lb.array[0]; 2228 la.ptr[1] *= lb.array[1]; 2229 return cast(__m128i)(la); 2230 } 2231 } 2232 unittest 2233 { 2234 __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff); 2235 __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff); 2236 __m128i C = _mm_mul_epu32(A, B); 2237 long2 LC = cast(long2)C; 2238 assert(LC.array[0] == 18446744065119617025uL); 2239 assert(LC.array[1] == 12723420444339690338uL); 2240 } 2241 2242 2243 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe 2244 { 2245 return a * b; 2246 } 2247 unittest 2248 { 2249 __m128d a = [-2.0, 1.5]; 2250 a = _mm_mul_pd(a, a); 2251 assert(a.array == [4.0, 2.25]); 2252 } 2253 2254 version(DigitalMars) 2255 { 2256 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 2257 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe 2258 { 2259 asm pure nothrow @nogc @trusted { nop;} 2260 a.array[0] = a.array[0] * b.array[0]; 2261 return a; 2262 } 2263 } 2264 else 2265 { 2266 static if (GDC_with_SSE2) 2267 { 2268 alias _mm_mul_sd = __builtin_ia32_mulsd; 2269 } 2270 else 2271 { 2272 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe 2273 { 2274 a.array[0] *= b.array[0]; 2275 return a; 2276 } 2277 } 2278 } 2279 unittest 2280 { 2281 __m128d a = [-2.0, 1.5]; 2282 a = _mm_mul_sd(a, a); 2283 assert(a.array == [4.0, 1.5]); 2284 } 2285 2286 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 2287 /// and get an unsigned 64-bit result. 
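/// (Implemented here by widening to `__m128i` and reusing `_mm_mul_epu32`.)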
2288 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe 2289 { 2290 return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b))); 2291 } 2292 unittest 2293 { 2294 __m64 A = _mm_set_pi32(42, 0xDEADBEEF); 2295 __m64 B = _mm_set_pi32(42, 0xCAFEBABE); 2296 __m64 C = _mm_mul_su32(A, B); 2297 assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL); 2298 } 2299 2300 version(LDC) 2301 { 2302 alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128; 2303 } 2304 else 2305 { 2306 static if (GDC_with_SSE2) 2307 { 2308 alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128; 2309 } 2310 else 2311 { 2312 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @safe 2313 { 2314 short8 sa = cast(short8)a; 2315 short8 sb = cast(short8)b; 2316 short8 r = void; 2317 r.array[0] = (sa.array[0] * sb.array[0]) >> 16; 2318 r.array[1] = (sa.array[1] * sb.array[1]) >> 16; 2319 r.array[2] = (sa.array[2] * sb.array[2]) >> 16; 2320 r.array[3] = (sa.array[3] * sb.array[3]) >> 16; 2321 r.array[4] = (sa.array[4] * sb.array[4]) >> 16; 2322 r.array[5] = (sa.array[5] * sb.array[5]) >> 16; 2323 r.array[6] = (sa.array[6] * sb.array[6]) >> 16; 2324 r.array[7] = (sa.array[7] * sb.array[7]) >> 16; 2325 return cast(__m128i)r; 2326 } 2327 } 2328 } 2329 unittest 2330 { 2331 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2332 __m128i B = _mm_set1_epi16(16384); 2333 short8 R = cast(short8)_mm_mulhi_epi16(A, B); 2334 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; 2335 assert(R.array == correct); 2336 } 2337 2338 version(LDC) 2339 { 2340 alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128; 2341 } 2342 else 2343 { 2344 static if (GDC_with_SSE2) 2345 { 2346 alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128; 2347 } 2348 else 2349 { 2350 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @safe 2351 { 2352 short8 sa = cast(short8)a; 2353 short8 sb = cast(short8)b; 2354 short8 r = void; 2355 r.array[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 ); 2356 r.array[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 ); 2357 r.array[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 ); 2358 r.array[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 ); 2359 r.array[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 ); 2360 r.array[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 ); 2361 r.array[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 ); 2362 r.array[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 ); 2363 return cast(__m128i)r; 2364 } 2365 } 2366 } 2367 unittest 2368 { 2369 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2370 __m128i B = _mm_set1_epi16(16384); 2371 short8 R = cast(short8)_mm_mulhi_epu16(A, B); 2372 short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; 2373 assert(R.array == correct); 2374 } 2375 2376 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe 2377 { 2378 return cast(__m128i)(cast(short8)a * cast(short8)b); 2379 } 2380 unittest 2381 { 2382 __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7); 2383 __m128i B = _mm_set1_epi16(16384); 2384 short8 R = cast(short8)_mm_mullo_epi16(A, B); 2385 short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384]; 2386 assert(R.array == correct); 2387 } 2388 2389 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe 2390 { 2391 return cast(__m128d)( cast(__m128i)a | cast(__m128i)b ); 2392 } 2393 2394 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe 2395 { 2396 return a | b; 2397 } 2398 
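
// Minimal added sanity check for `_mm_or_si128` (a sketch, not an exhaustive test):
// bitwise OR works lane-wise on the 32-bit view of the vector.
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 4, 8);
    __m128i B = _mm_setr_epi32(2, 2, 0, 8);
    int4 R = cast(int4) _mm_or_si128(A, B);
    int[4] correct = [3, 2, 4, 8];
    assert(R.array == correct);
}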
2399 version(LDC) 2400 { 2401 alias _mm_packs_epi32 = __builtin_ia32_packssdw128; 2402 } 2403 else 2404 { 2405 static if (GDC_with_SSE2) 2406 { 2407 alias _mm_packs_epi32 = __builtin_ia32_packssdw128; 2408 } 2409 else 2410 { 2411 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @safe 2412 { 2413 short8 r; 2414 r.array[0] = saturateSignedIntToSignedShort(a.array[0]); 2415 r.array[1] = saturateSignedIntToSignedShort(a.array[1]); 2416 r.array[2] = saturateSignedIntToSignedShort(a.array[2]); 2417 r.array[3] = saturateSignedIntToSignedShort(a.array[3]); 2418 r.array[4] = saturateSignedIntToSignedShort(b.array[0]); 2419 r.array[5] = saturateSignedIntToSignedShort(b.array[1]); 2420 r.array[6] = saturateSignedIntToSignedShort(b.array[2]); 2421 r.array[7] = saturateSignedIntToSignedShort(b.array[3]); 2422 return cast(__m128i)r; 2423 } 2424 } 2425 } 2426 unittest 2427 { 2428 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 2429 short8 R = cast(short8) _mm_packs_epi32(A, A); 2430 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; 2431 assert(R.array == correct); 2432 } 2433 2434 version(LDC) 2435 { 2436 alias _mm_packs_epi16 = __builtin_ia32_packsswb128; 2437 } 2438 else 2439 { 2440 static if (GDC_with_SSE2) 2441 { 2442 alias _mm_packs_epi16 = __builtin_ia32_packsswb128; 2443 } 2444 else 2445 { 2446 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @safe 2447 { 2448 byte16 r; 2449 short8 sa = cast(short8)a; 2450 short8 sb = cast(short8)b; 2451 foreach(i; 0..8) 2452 r.array[i] = saturateSignedWordToSignedByte(sa.array[i]); 2453 foreach(i; 0..8) 2454 r.array[i+8] = saturateSignedWordToSignedByte(sb.array[i]); 2455 return cast(__m128i)r; 2456 } 2457 } 2458 } 2459 unittest 2460 { 2461 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); 2462 byte16 R = cast(byte16) _mm_packs_epi16(A, A); 2463 byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0, 2464 127, -128, 127, 0, 127, -128, 127, 0]; 2465 assert(R.array == correct); 2466 } 2467 2468 version(LDC) 2469 { 2470 alias _mm_packus_epi16 = __builtin_ia32_packuswb128; 2471 } 2472 else 2473 { 2474 static if (GDC_with_SSE2) 2475 { 2476 alias _mm_packus_epi16 = __builtin_ia32_packuswb128; 2477 } 2478 else 2479 { 2480 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2481 { 2482 short8 sa = cast(short8)a; 2483 short8 sb = cast(short8)b; 2484 ubyte[16] result = void; 2485 for (int i = 0; i < 8; ++i) 2486 { 2487 short s = sa[i]; 2488 if (s < 0) s = 0; 2489 if (s > 255) s = 255; 2490 result[i] = cast(ubyte)s; 2491 2492 s = sb[i]; 2493 if (s < 0) s = 0; 2494 if (s > 255) s = 255; 2495 result[i+8] = cast(ubyte)s; 2496 } 2497 return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr); 2498 } 2499 } 2500 } 2501 unittest 2502 { 2503 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 2504 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 2505 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 2506 0, 255, 0, 255, 255, 2, 1, 0]; 2507 foreach(i; 0..16) 2508 assert(AA.array[i] == cast(byte)(correctResult[i])); 2509 } 2510 2511 2512 version(GNU) 2513 { 2514 void _mm_pause() pure @trusted 2515 { 2516 static if (GDC_with_SSE2) 2517 { 2518 __builtin_ia32_pause(); 2519 } 2520 else version(X86) 2521 { 2522 asm pure nothrow @nogc @trusted 2523 { 2524 "pause;\n" : : : ; 2525 } 2526 } 2527 else 2528 static assert(false); 2529 } 2530 } 2531 else version(LDC) 2532 { 2533 alias _mm_pause = __builtin_ia32_pause; 2534 } 2535 else static if (DMD_with_asm) 2536 { 2537 void _mm_pause() pure 
@safe 2538 { 2539 asm nothrow @nogc pure @safe 2540 { 2541 rep; nop; // F3 90 = pause 2542 } 2543 } 2544 } 2545 else 2546 static assert(false); 2547 unittest 2548 { 2549 _mm_pause(); 2550 } 2551 2552 2553 version(LDC) 2554 { 2555 alias _mm_sad_epu8 = __builtin_ia32_psadbw128; 2556 } 2557 else 2558 { 2559 static if (GDC_with_SSE2) 2560 { 2561 alias _mm_sad_epu8 = __builtin_ia32_psadbw128; 2562 } 2563 else 2564 { 2565 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @safe 2566 { 2567 byte16 ab = cast(byte16)a; 2568 byte16 bb = cast(byte16)b; 2569 ubyte[16] t; 2570 foreach(i; 0..16) 2571 { 2572 int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]); 2573 if (diff < 0) diff = -diff; 2574 t[i] = cast(ubyte)(diff); 2575 } 2576 int4 r = _mm_setzero_si128(); 2577 r.array[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 2578 r.array[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; 2579 return r; 2580 } 2581 } 2582 } 2583 unittest 2584 { 2585 __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 2586 __m128i B = _mm_set1_epi8(1); 2587 __m128i R = _mm_sad_epu8(A, B); 2588 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 2589 0, 2590 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 2591 0]; 2592 assert(R.array == correct); 2593 } 2594 2595 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 2596 { 2597 short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7]; 2598 return cast(__m128i) loadUnaligned!(short8)(result.ptr); 2599 } 2600 unittest 2601 { 2602 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 2603 short8 B = cast(short8) A; 2604 foreach(i; 0..8) 2605 assert(B.array[i] == i); 2606 } 2607 2608 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 2609 { 2610 int[4] result = [e0, e1, e2, e3]; 2611 return loadUnaligned!(int4)(result.ptr); 2612 } 2613 unittest 2614 { 2615 __m128i A = _mm_set_epi32(3, 2, 1, 0); 2616 foreach(i; 0..4) 2617 assert(A.array[i] == i); 2618 } 2619 2620 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted 2621 { 2622 long[2] result = [e0.array[0], e1.array[0]]; 2623 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 2624 } 2625 unittest 2626 { 2627 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); 2628 long2 B = cast(long2) A; 2629 assert(B.array[0] == 5678); 2630 assert(B.array[1] == 1234); 2631 } 2632 2633 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted 2634 { 2635 long[2] result = [e0, e1]; 2636 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 2637 } 2638 unittest 2639 { 2640 __m128i A = _mm_set_epi64x(1234, 5678); 2641 long2 B = cast(long2) A; 2642 assert(B.array[0] == 5678); 2643 assert(B.array[1] == 1234); 2644 } 2645 2646 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12, 2647 byte e11, byte e10, byte e9, byte e8, 2648 byte e7, byte e6, byte e5, byte e4, 2649 byte e3, byte e2, byte e1, byte e0) pure @trusted 2650 { 2651 byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7, 2652 e8, e9, e10, e11, e12, e13, e14, e15]; 2653 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 2654 } 2655 2656 __m128d _mm_set_pd (double e1, double e0) pure @trusted 2657 { 2658 double[2] result = [e0, e1]; 2659 return loadUnaligned!(double2)(result.ptr); 2660 } 2661 unittest 2662 { 2663 __m128d A = _mm_set_pd(61.0, 55.0); 2664 double[2] correct = [55.0, 61.0]; 2665 assert(A.array == correct); 2666 } 2667 2668 __m128d _mm_set_pd1 (double a) pure @trusted 2669 { 2670 double[2] 
result = [a, a]; 2671 return loadUnaligned!(double2)(result.ptr); 2672 } 2673 unittest 2674 { 2675 __m128d A = _mm_set_pd1(61.0); 2676 double[2] correct = [61.0, 61.0]; 2677 assert(A.array == correct); 2678 } 2679 2680 __m128d _mm_set_sd (double a) pure @trusted 2681 { 2682 double[2] result = [a, 0]; 2683 return loadUnaligned!(double2)(result.ptr); 2684 } 2685 2686 __m128i _mm_set1_epi16 (short a) pure @trusted 2687 { 2688 return cast(__m128i)(short8(a)); 2689 } 2690 2691 __m128i _mm_set1_epi32 (int a) pure @trusted 2692 { 2693 return cast(__m128i)(int4(a)); 2694 } 2695 unittest 2696 { 2697 __m128 a = _mm_set1_ps(-1.0f); 2698 __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff); 2699 assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]); 2700 } 2701 2702 /// Broadcast 64-bit integer `a` to all elements of `dst`. 2703 __m128i _mm_set1_epi64 (__m64 a) pure @safe 2704 { 2705 return _mm_set_epi64(a, a); 2706 } 2707 2708 __m128i _mm_set1_epi64x (long a) pure @trusted 2709 { 2710 return cast(__m128i)(long2(a)); 2711 } 2712 2713 __m128i _mm_set1_epi8 (byte a) pure @trusted 2714 { 2715 return cast(__m128i)(byte16(a)); 2716 } 2717 2718 alias _mm_set1_pd = _mm_set_pd1; 2719 2720 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 2721 short e3, short e2, short e1, short e0) pure @trusted 2722 { 2723 short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0]; 2724 return cast(__m128i)( loadUnaligned!(short8)(result.ptr) ); 2725 } 2726 2727 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted 2728 { 2729 int[4] result = [e3, e2, e1, e0]; 2730 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 2731 } 2732 2733 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted 2734 { 2735 long[2] result = [e1, e0]; 2736 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 2737 } 2738 2739 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12, 2740 byte e11, byte e10, byte e9, byte e8, 2741 byte e7, byte e6, byte e5, byte e4, 2742 byte e3, byte e2, byte e1, byte e0) pure @trusted 2743 { 2744 byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, 2745 e7, e6, e5, e4, e3, e2, e1, e0]; 2746 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 2747 } 2748 2749 __m128d _mm_setr_pd (double e1, double e0) pure @trusted 2750 { 2751 double[2] result = [e1, e0]; 2752 return loadUnaligned!(double2)(result.ptr); 2753 } 2754 unittest 2755 { 2756 __m128d A = _mm_setr_pd(61.0, 55.0); 2757 double[2] correct = [61.0, 55.0]; 2758 assert(A.array == correct); 2759 } 2760 2761 __m128d _mm_setzero_pd () pure @trusted 2762 { 2763 double[2] result = [0.0, 0.0]; 2764 return loadUnaligned!(double2)(result.ptr); 2765 } 2766 2767 __m128i _mm_setzero_si128() pure @trusted 2768 { 2769 int[4] result = [0, 0, 0, 0]; 2770 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 2771 } 2772 2773 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe 2774 { 2775 static if (GDC_with_SSE2) 2776 { 2777 return __builtin_ia32_pshufd(a, imm8); 2778 } 2779 else 2780 { 2781 return shufflevector!(int4, (imm8 >> 0) & 3, 2782 (imm8 >> 2) & 3, 2783 (imm8 >> 4) & 3, 2784 (imm8 >> 6) & 3)(a, a); 2785 } 2786 } 2787 unittest 2788 { 2789 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 2790 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 2791 int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A); 2792 int[4] expectedB = [ 3, 2, 1, 0 ]; 2793 assert(B.array == expectedB); 2794 } 2795 2796 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe 2797 { 2798 static if (GDC_with_SSE2) 2799 { 2800 return __builtin_ia32_shufpd(a, b, 
imm8); 2801 } 2802 else 2803 { 2804 return shufflevector!(double2, 0 + ( imm8 & 1 ), 2805 2 + ( (imm8 >> 1) & 1 ))(a, b); 2806 } 2807 } 2808 unittest 2809 { 2810 __m128d A = _mm_setr_pd(0.5, 2.0); 2811 __m128d B = _mm_setr_pd(4.0, 5.0); 2812 enum int SHUFFLE = _MM_SHUFFLE2(1, 1); 2813 __m128d R = _mm_shuffle_pd!SHUFFLE(A, B); 2814 double[2] correct = [ 2.0, 5.0 ]; 2815 assert(R.array == correct); 2816 } 2817 2818 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe 2819 { 2820 static if (GDC_with_SSE2) 2821 { 2822 return __builtin_ia32_pshufhw(a, imm8); 2823 } 2824 else 2825 { 2826 return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3, 2827 4 + ( (imm8 >> 0) & 3 ), 2828 4 + ( (imm8 >> 2) & 3 ), 2829 4 + ( (imm8 >> 4) & 3 ), 2830 4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a); 2831 } 2832 } 2833 unittest 2834 { 2835 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 2836 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 2837 short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A); 2838 short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ]; 2839 assert(C.array == expectedC); 2840 } 2841 2842 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe 2843 { 2844 static if (GDC_with_SSE2) 2845 { 2846 return __builtin_ia32_pshuflw(a, imm8); 2847 } 2848 else 2849 { 2850 return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ), 2851 ( (imm8 >> 2) & 3 ), 2852 ( (imm8 >> 4) & 3 ), 2853 ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a); 2854 } 2855 } 2856 unittest 2857 { 2858 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 2859 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 2860 short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A); 2861 short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ]; 2862 assert(B.array == expectedB); 2863 } 2864 2865 version(LDC) 2866 { 2867 alias _mm_sll_epi32 = __builtin_ia32_pslld128; 2868 } 2869 else static if (GDC_with_SSE2) 2870 { 2871 alias _mm_sll_epi32 = __builtin_ia32_pslld128; 2872 } 2873 else static if (DMD_with_32bit_asm) 2874 { 2875 __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe 2876 { 2877 asm pure nothrow @nogc @trusted 2878 { 2879 movdqu XMM0, a; 2880 movdqu XMM1, count; 2881 pslld XMM0, XMM1; 2882 movdqu a, XMM0; 2883 } 2884 return a; 2885 } 2886 } 2887 else 2888 { 2889 2890 __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe 2891 { 2892 int4 r = void; 2893 long2 lc = cast(long2)count; 2894 int bits = cast(int)(lc.array[0]); 2895 foreach(i; 0..4) 2896 r[i] = cast(uint)(a[i]) << bits; 2897 return r; 2898 } 2899 } 2900 unittest 2901 { 2902 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 2903 __m128i B = _mm_sll_epi32(A, _mm_cvtsi32_si128(1)); 2904 int[4] expectedB = [ 0, 4, 6, -8]; 2905 assert(B.array == expectedB); 2906 } 2907 2908 version(LDC) 2909 { 2910 alias _mm_sll_epi64 = __builtin_ia32_psllq128; 2911 } 2912 else static if (GDC_with_SSE2) 2913 { 2914 alias _mm_sll_epi64 = __builtin_ia32_psllq128; 2915 } 2916 else static if (DMD_with_32bit_asm) 2917 { 2918 __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe 2919 { 2920 asm pure nothrow @nogc @trusted 2921 { 2922 movdqu XMM0, a; 2923 movdqu XMM1, count; 2924 psllq XMM0, XMM1; 2925 movdqu a, XMM0; 2926 } 2927 return a; 2928 } 2929 } 2930 else 2931 { 2932 __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe 2933 { 2934 long2 r = void; 2935 long2 sa = cast(long2)a; 2936 long2 lc = cast(long2)count; 2937 int bits = cast(int)(lc.array[0]); 2938 foreach(i; 0..2) 2939 r.array[i] = cast(ulong)(sa.array[i]) << bits; 2940 return cast(__m128i)r; 2941 } 2942 } 2943 
unittest 2944 { 2945 __m128i A = _mm_setr_epi64(8, -4); 2946 long2 B = cast(long2) _mm_sll_epi64(A, _mm_cvtsi32_si128(1)); 2947 long[2] expectedB = [ 16, -8]; 2948 assert(B.array == expectedB); 2949 } 2950 2951 version(LDC) 2952 { 2953 alias _mm_sll_epi16 = __builtin_ia32_psllw128; 2954 } 2955 else static if (GDC_with_SSE2) 2956 { 2957 alias _mm_sll_epi16 = __builtin_ia32_psllw128; 2958 } 2959 else static if (DMD_with_32bit_asm) 2960 { 2961 __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted 2962 { 2963 asm pure nothrow @nogc 2964 { 2965 movdqu XMM0, a; 2966 movdqu XMM1, count; 2967 psllw XMM0, XMM1; 2968 movdqu a, XMM0; 2969 } 2970 return a; 2971 } 2972 } 2973 else 2974 { 2975 __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted 2976 { 2977 short8 sa = cast(short8)a; 2978 long2 lc = cast(long2)count; 2979 int bits = cast(int)(lc.array[0]); 2980 short8 r = void; 2981 foreach(i; 0..8) 2982 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits); 2983 return cast(int4)r; 2984 } 2985 } 2986 unittest 2987 { 2988 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 2989 short8 B = cast(short8)( _mm_sll_epi16(A, _mm_cvtsi32_si128(1)) ); 2990 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 2991 assert(B.array == expectedB); 2992 } 2993 2994 version(LDC) 2995 { 2996 alias _mm_slli_epi32 = __builtin_ia32_pslldi128; 2997 } 2998 else 2999 { 3000 static if (GDC_with_SSE2) 3001 { 3002 alias _mm_slli_epi32 = __builtin_ia32_pslldi128; 3003 } 3004 else 3005 { 3006 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe 3007 { 3008 int4 r = void; 3009 foreach(i; 0..4) 3010 r.array[i] = cast(uint)(a.array[i]) << imm8; 3011 return r; 3012 } 3013 } 3014 } 3015 unittest 3016 { 3017 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3018 __m128i B = _mm_slli_epi32(A, 1); 3019 int[4] expectedB = [ 0, 4, 6, -8]; 3020 assert(B.array == expectedB); 3021 } 3022 3023 version(LDC) 3024 { 3025 alias _mm_slli_epi64 = __builtin_ia32_psllqi128; 3026 } 3027 else 3028 { 3029 static if (GDC_with_SSE2) 3030 { 3031 alias _mm_slli_epi64 = __builtin_ia32_psllqi128; 3032 } 3033 else 3034 { 3035 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe 3036 { 3037 long2 r = void; 3038 long2 sa = cast(long2)a; 3039 foreach(i; 0..2) 3040 r.array[i] = cast(ulong)(sa.array[i]) << imm8; 3041 return cast(__m128i)r; 3042 } 3043 } 3044 } 3045 unittest 3046 { 3047 __m128i A = _mm_setr_epi64(8, -4); 3048 long2 B = cast(long2) _mm_slli_epi64(A, 1); 3049 long[2] expectedB = [ 16, -8]; 3050 assert(B.array == expectedB); 3051 } 3052 3053 version(LDC) 3054 { 3055 alias _mm_slli_epi16 = __builtin_ia32_psllwi128; 3056 } 3057 else 3058 { 3059 static if (GDC_with_SSE2) 3060 { 3061 alias _mm_slli_epi16 = __builtin_ia32_psllwi128; 3062 } 3063 else 3064 { 3065 __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @safe 3066 { 3067 short8 sa = cast(short8)a; 3068 short8 r = void; 3069 foreach(i; 0..8) 3070 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) << imm8); 3071 return cast(int4)r; 3072 } 3073 } 3074 } 3075 unittest 3076 { 3077 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3078 short8 B = cast(short8)( _mm_slli_epi16(A, 1) ); 3079 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 3080 assert(B.array == expectedB); 3081 } 3082 3083 3084 /// Shift `a` left by `bytes` bytes while shifting in zeros. 
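/// (For instance `_mm_slli_si128!8(v)` moves the low 64 bits of `v` into the
/// high 64 bits, as the unittest below demonstrates.)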
3085 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted 3086 { 3087 static if (bytes & 0xF0) 3088 { 3089 return _mm_setzero_si128(); 3090 } 3091 else 3092 { 3093 static if (GDC_with_SSE2) 3094 { 3095 return __builtin_ia32_pslldqi128(op, cast(ubyte)(bytes * 8)); 3096 } 3097 else version(DigitalMars) 3098 { 3099 version(D_InlineAsm_X86) 3100 { 3101 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64 3102 { 3103 movdqu XMM0, op; 3104 pslldq XMM0, bytes; 3105 movdqu op, XMM0; 3106 } 3107 return op; 3108 } 3109 else 3110 { 3111 byte16 A = cast(byte16)op; 3112 byte16 R; 3113 for (int n = 15; n >= bytes; --n) 3114 R.ptr[n] = A.array[n-bytes]; 3115 for (int n = bytes-1; n >= 0; --n) 3116 R.ptr[n] = 0; 3117 return cast(__m128i)R; 3118 } 3119 } 3120 else 3121 { 3122 return cast(__m128i) shufflevector!(byte16, 3123 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes, 3124 22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes, 3125 28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes) 3126 (cast(byte16)_mm_setzero_si128(), cast(byte16)op); 3127 } 3128 } 3129 } 3130 unittest 3131 { 3132 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3133 short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left 3134 short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ]; 3135 assert(R.array == correct); 3136 } 3137 3138 version(LDC) 3139 { 3140 // Disappeared with LDC 1.11 3141 static if (__VERSION__ < 2081) 3142 alias _mm_sqrt_pd = __builtin_ia32_sqrtpd; 3143 else 3144 { 3145 __m128d _mm_sqrt_pd(__m128d vec) pure @safe 3146 { 3147 vec.array[0] = llvm_sqrt(vec.array[0]); 3148 vec.array[1] = llvm_sqrt(vec.array[1]); 3149 return vec; 3150 } 3151 } 3152 } 3153 else 3154 { 3155 static if (GDC_with_SSE2) 3156 { 3157 alias _mm_sqrt_pd = __builtin_ia32_sqrtpd; 3158 } 3159 else 3160 { 3161 __m128d _mm_sqrt_pd(__m128d vec) pure @safe 3162 { 3163 vec.array[0] = sqrt(vec.array[0]); 3164 vec.array[1] = sqrt(vec.array[1]); 3165 return vec; 3166 } 3167 } 3168 } 3169 3170 3171 version(LDC) 3172 { 3173 // Disappeared with LDC 1.11 3174 static if (__VERSION__ < 2081) 3175 alias _mm_sqrt_sd = __builtin_ia32_sqrtsd; 3176 else 3177 { 3178 __m128d _mm_sqrt_sd(__m128d vec) pure @safe 3179 { 3180 vec.array[0] = llvm_sqrt(vec.array[0]); 3181 vec.array[1] = vec.array[1]; 3182 return vec; 3183 } 3184 } 3185 } 3186 else 3187 { 3188 static if (GDC_with_SSE2) 3189 { 3190 alias _mm_sqrt_sd = __builtin_ia32_sqrtsd; 3191 } 3192 else 3193 { 3194 __m128d _mm_sqrt_sd(__m128d vec) pure @safe 3195 { 3196 vec.array[0] = sqrt(vec.array[0]); 3197 vec.array[1] = vec.array[1]; 3198 return vec; 3199 } 3200 } 3201 } 3202 3203 3204 version(LDC) 3205 { 3206 alias _mm_sra_epi16 = __builtin_ia32_psraw128; 3207 } 3208 else 3209 { 3210 static if (GDC_with_SSE2) 3211 { 3212 alias _mm_sra_epi16 = __builtin_ia32_psraw128; 3213 } 3214 else 3215 { 3216 __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe 3217 { 3218 short8 sa = cast(short8)a; 3219 long2 lc = cast(long2)count; 3220 int bits = cast(int)(lc.array[0]); 3221 short8 r = void; 3222 foreach(i; 0..8) 3223 r.array[i] = cast(short)(sa.array[i] >> bits); 3224 return cast(int4)r; 3225 } 3226 } 3227 } 3228 unittest 3229 { 3230 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3231 short8 B = cast(short8)( _mm_sra_epi16(A, _mm_cvtsi32_si128(1)) ); 3232 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 3233 assert(B.array == expectedB); 3234 } 3235 3236 version(LDC) 3237 { 3238 alias _mm_sra_epi32 = __builtin_ia32_psrad128; 3239 } 3240 else 
3241 { 3242 static if (GDC_with_SSE2) 3243 { 3244 alias _mm_sra_epi32 = __builtin_ia32_psrad128; 3245 } 3246 else 3247 { 3248 __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @safe 3249 { 3250 int4 r = void; 3251 long2 lc = cast(long2)count; 3252 int bits = cast(int)(lc.array[0]); 3253 foreach(i; 0..4) 3254 r.array[i] = (a.array[i] >> bits); 3255 return r; 3256 } 3257 } 3258 } 3259 unittest 3260 { 3261 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3262 __m128i B = _mm_sra_epi32(A, _mm_cvtsi32_si128(1)); 3263 int[4] expectedB = [ 0, 1, 1, -2]; 3264 assert(B.array == expectedB); 3265 } 3266 3267 3268 version(LDC) 3269 { 3270 alias _mm_srai_epi16 = __builtin_ia32_psrawi128; 3271 } 3272 else 3273 { 3274 static if (GDC_with_SSE2) 3275 { 3276 alias _mm_srai_epi16 = __builtin_ia32_psrawi128; 3277 } 3278 else 3279 { 3280 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @safe 3281 { 3282 short8 sa = cast(short8)a; 3283 short8 r = void; 3284 foreach(i; 0..8) 3285 r.array[i] = cast(short)(sa.array[i] >> imm8); 3286 return cast(int4)r; 3287 } 3288 } 3289 } 3290 unittest 3291 { 3292 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3293 short8 B = cast(short8)( _mm_srai_epi16(A, 1) ); 3294 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 3295 assert(B.array == expectedB); 3296 } 3297 3298 version(LDC) 3299 { 3300 alias _mm_srai_epi32 = __builtin_ia32_psradi128; 3301 } 3302 else 3303 { 3304 static if (GDC_with_SSE2) 3305 { 3306 alias _mm_srai_epi32 = __builtin_ia32_psradi128; 3307 } 3308 else 3309 { 3310 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe 3311 { 3312 int4 r = void; 3313 foreach(i; 0..4) 3314 r.array[i] = (a.array[i] >> imm8); 3315 return r; 3316 } 3317 } 3318 } 3319 unittest 3320 { 3321 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3322 __m128i B = _mm_srai_epi32(A, 1); 3323 int[4] expectedB = [ 0, 1, 1, -2]; 3324 assert(B.array == expectedB); 3325 } 3326 3327 version(LDC) 3328 { 3329 alias _mm_srl_epi16 = __builtin_ia32_psrlw128; 3330 } 3331 else 3332 { 3333 static if (GDC_with_SSE2) 3334 { 3335 alias _mm_srl_epi16 = __builtin_ia32_psrlw128; 3336 } 3337 else 3338 { 3339 __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe 3340 { 3341 short8 sa = cast(short8)a; 3342 long2 lc = cast(long2)count; 3343 int bits = cast(int)(lc.array[0]); 3344 short8 r = void; 3345 foreach(i; 0..8) 3346 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits); 3347 return cast(int4)r; 3348 } 3349 } 3350 } 3351 unittest 3352 { 3353 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3354 short8 B = cast(short8)( _mm_srl_epi16(A, _mm_cvtsi32_si128(1)) ); 3355 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 3356 assert(B.array == expectedB); 3357 } 3358 3359 version(LDC) 3360 { 3361 alias _mm_srl_epi32 = __builtin_ia32_psrld128; 3362 } 3363 else 3364 { 3365 static if (GDC_with_SSE2) 3366 { 3367 alias _mm_srl_epi32 = __builtin_ia32_psrld128; 3368 } 3369 else 3370 { 3371 __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @safe 3372 { 3373 int4 r = void; 3374 long2 lc = cast(long2)count; 3375 int bits = cast(int)(lc.array[0]); 3376 foreach(i; 0..4) 3377 r.array[i] = cast(uint)(a.array[i]) >> bits; 3378 return r; 3379 } 3380 } 3381 } 3382 unittest 3383 { 3384 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3385 __m128i B = _mm_srl_epi32(A, _mm_cvtsi32_si128(1)); 3386 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; 3387 assert(B.array == expectedB); 3388 } 3389 3390 version(LDC) 3391 { 3392 alias _mm_srl_epi64 = __builtin_ia32_psrlq128; 3393 } 3394 else 3395 { 3396 static if (GDC_with_SSE2) 3397 
{ 3398 alias _mm_srl_epi64 = __builtin_ia32_psrlq128; 3399 } 3400 else 3401 { 3402 __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe 3403 { 3404 long2 r = void; 3405 long2 sa = cast(long2)a; 3406 long2 lc = cast(long2)count; 3407 int bits = cast(int)(lc.array[0]); 3408 foreach(i; 0..2) 3409 r.array[i] = cast(ulong)(sa.array[i]) >> bits; 3410 return cast(__m128i)r; 3411 } 3412 } 3413 } 3414 unittest 3415 { 3416 __m128i A = _mm_setr_epi64(8, -4); 3417 long2 B = cast(long2) _mm_srl_epi64(A, _mm_cvtsi32_si128(1)); 3418 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; 3419 assert(B.array == expectedB); 3420 } 3421 3422 version(LDC) 3423 { 3424 alias _mm_srli_epi16 = __builtin_ia32_psrlwi128; 3425 } 3426 else 3427 { 3428 static if (GDC_with_SSE2) 3429 { 3430 alias _mm_srli_epi16 = __builtin_ia32_psrlwi128; 3431 } 3432 else 3433 { 3434 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe 3435 { 3436 short8 sa = cast(short8)a; 3437 short8 r = void; 3438 foreach(i; 0..8) 3439 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> imm8); 3440 return cast(int4)r; 3441 } 3442 } 3443 } 3444 unittest 3445 { 3446 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3447 short8 B = cast(short8)( _mm_srli_epi16(A, 1) ); 3448 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 3449 assert(B.array == expectedB); 3450 } 3451 3452 version(LDC) 3453 { 3454 alias _mm_srli_epi32 = __builtin_ia32_psrldi128; 3455 } 3456 else 3457 { 3458 static if (GDC_with_SSE2) 3459 { 3460 alias _mm_srli_epi32 = __builtin_ia32_psrldi128; 3461 } 3462 else 3463 { 3464 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @safe 3465 { 3466 int4 r = void; 3467 foreach(i; 0..4) 3468 r.array[i] = cast(uint)(a.array[i]) >> imm8; 3469 return r; 3470 } 3471 } 3472 } 3473 unittest 3474 { 3475 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3476 __m128i B = _mm_srli_epi32(A, 1); 3477 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; 3478 assert(B.array == expectedB); 3479 } 3480 3481 version(LDC) 3482 { 3483 alias _mm_srli_epi64 = __builtin_ia32_psrlqi128; 3484 } 3485 else 3486 { 3487 static if (GDC_with_SSE2) 3488 { 3489 alias _mm_srli_epi64 = __builtin_ia32_psrlqi128; 3490 } 3491 else 3492 { 3493 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @safe 3494 { 3495 long2 r = void; 3496 long2 sa = cast(long2)a; 3497 foreach(i; 0..2) 3498 r.array[i] = cast(ulong)(sa.array[i]) >> imm8; 3499 return cast(__m128i)r; 3500 } 3501 } 3502 } 3503 unittest 3504 { 3505 __m128i A = _mm_setr_epi64(8, -4); 3506 long2 B = cast(long2) _mm_srli_epi64(A, 1); 3507 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; 3508 assert(B.array == expectedB); 3509 } 3510 3511 /// Shift `v` right by `bytes` bytes while shifting in zeros. 
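/// (For instance `_mm_srli_si128!4(v)` discards the lowest 32-bit lane, as the
/// unittest below demonstrates.)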
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    static if (bytes & 0xF0)
    {
        return _mm_setzero_si128();
    }
    else
    {
        static if (GDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psrldqi128(v, cast(ubyte)(bytes * 8));
        }
        else static if (DMD_with_32bit_asm)
        {
            asm pure nothrow @nogc @trusted
            {
                movdqu XMM0, v;
                psrldq XMM0, bytes;
                movdqu v, XMM0;
            }
            return v;
        }
        else
        {
            return cast(__m128i) shufflevector!(byte16,
                bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
                bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
                (cast(byte16) v, cast(byte16)_mm_setzero_si128());
        }
    }
}
unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] correct = [2, 3, 4, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
}
unittest
{
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary.
void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into
/// 2 contiguous elements in memory. `mem_addr` must be aligned on a 16-byte boundary.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    __m128d r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[0];
    *aligned = r;
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[0];
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned
/// on a 16-byte boundary.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1;

/// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[1];
}

// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
3606 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe 3607 { 3608 long* dest = cast(long*)mem_addr; 3609 long2 la = cast(long2)a; 3610 *dest = la.array[0]; 3611 } 3612 unittest 3613 { 3614 long[3] A = [1, 2, 3]; 3615 _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000)); 3616 long[3] correct = [1, 0x1_0000_0000, 3]; 3617 assert(A == correct); 3618 } 3619 3620 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe 3621 { 3622 *mem_addr = a.array[0]; 3623 } 3624 3625 void _mm_storer_pd (double* mem_addr, __m128d a) pure 3626 { 3627 __m128d* aligned = cast(__m128d*)mem_addr; 3628 *aligned = shufflevector!(double2, 1, 0)(a, a); 3629 } 3630 3631 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe 3632 { 3633 storeUnaligned!double2(a, mem_addr); 3634 } 3635 3636 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe 3637 { 3638 storeUnaligned!__m128i(a, cast(int*)mem_addr); 3639 } 3640 3641 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) 3642 /// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte 3643 /// boundary or a general-protection exception may be generated. 3644 void _mm_stream_pd (double* mem_addr, __m128d a) 3645 { 3646 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 3647 __m128d* dest = cast(__m128d*)mem_addr; 3648 *dest = a; 3649 } 3650 3651 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint. 3652 /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception 3653 /// may be generated. 3654 void _mm_stream_si128 (__m128i* mem_addr, __m128i a) 3655 { 3656 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 3657 __m128i* dest = cast(__m128i*)mem_addr; 3658 *dest = a; 3659 } 3660 3661 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache 3662 /// pollution. If the cache line containing address mem_addr is already in the cache, 3663 /// the cache will be updated. 3664 void _mm_stream_si32 (int* mem_addr, int a) 3665 { 3666 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 3667 *mem_addr = a; 3668 } 3669 3670 /// Store 64-bit integer a into memory using a non-temporal hint to minimize 3671 /// cache pollution. If the cache line containing address mem_addr is already 3672 /// in the cache, the cache will be updated. 
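/// (Like the other `_mm_stream_*` functions in this module, this is currently an
/// ordinary store; see the BUG note in the body.)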
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    return a - b;
}

version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
}
else static if (GDC_with_SSE2)
{
    alias _mm_sub_sd = __builtin_ia32_subsd;
}
else
{
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        a.array[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}

/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    return a - b;
}

version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBSW since LDC 1.15 -O0
        /// Subtract packed 16-bit signed integers in `b` from `a` using signed saturation.
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
    }
    else
    {
        /// Subtract packed 16-bit signed integers in `b` from `a` using signed saturation.
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(  -10,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBSB since LDC 1.15 -O0
        /// Subtract packed 8-bit signed integers in `b` from `a` using signed saturation.
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
    }
    else
    {
        /// Subtract packed 8-bit signed integers in `b` from `a` using signed saturation.
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBUSW since LDC 1.15 -O0
        /// Subtract packed 16-bit unsigned integers in `b` from `a` using unsigned saturation.
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
    }
    else
    {
        /// Subtract packed 16-bit unsigned integers in `b` from `a` using unsigned saturation.
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                int diff = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(diff);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534, 1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct = [0, 0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}

version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBUSB since LDC 1.15 -O0
        /// Subtract packed 8-bit unsigned integers in `b` from `a` using unsigned saturation.
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
    }
    else
    {
        /// Subtract packed 8-bit unsigned integers in `b` from `a` using unsigned saturation.
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Note: the only difference between the `ucomi` and `comi` intrinsics is the
// signalling behaviour on quiet NaNs. Strictly speaking the aliasing is
// incorrect, but the case where you would want to differentiate between qNaN
// and sNaN, and treat them differently on purpose, seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd;
alias _mm_ucomige_sd = _mm_comige_sd;
alias _mm_ucomigt_sd = _mm_comigt_sd;
alias _mm_ucomile_sd = _mm_comile_sd;
alias _mm_ucomilt_sd = _mm_comilt_sd;
alias _mm_ucomineq_sd = _mm_comineq_sd;

/// Return a vector with undefined contents.
__m128d _mm_undefined_pd() pure @safe
{
    __m128d result = void;
    return result;
}

/// Return a vector with undefined contents.
__m128i _mm_undefined_si128() pure @safe
{
    __m128i result = void;
    return result;
}

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhwd128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhdq128(a, b);
    }
    else
    {
        return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
    }
}

/// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhqdq128(a, b);
    }
    else
    {
        __m128i r = cast(__m128i)b;
        // The high 64 bits of `a` are its 32-bit lanes 2 and 3.
        r[0] = a[2];
        r[1] = a[3];
        return r;
4003 } 4004 } 4005 unittest // Issue #36 4006 { 4007 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4008 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4009 long2 C = cast(long2)(_mm_unpackhi_epi64(A, B)); 4010 long[2] correct = [0x33333333_33333333, 0x55555555_55555555]; 4011 assert(C.array == correct); 4012 } 4013 4014 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe 4015 { 4016 static if (GDC_with_SSE2) 4017 { 4018 return __builtin_ia32_punpckhbw128(a, b); 4019 } 4020 else static if (DMD_with_32bit_asm) 4021 { 4022 asm pure nothrow @nogc @trusted 4023 { 4024 movdqu XMM0, a; 4025 movdqu XMM1, b; 4026 punpckhbw XMM0, XMM1; 4027 movdqu a, XMM0; 4028 } 4029 return a; 4030 } 4031 else 4032 { 4033 return cast(__m128i)shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27, 4034 12, 28, 13, 29, 14, 30, 15, 31) 4035 (cast(byte16)a, cast(byte16)b); 4036 } 4037 } 4038 4039 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe 4040 { 4041 static if (GDC_with_SSE2) 4042 { 4043 return __builtin_ia32_unpckhpd(a, b); 4044 } 4045 else 4046 { 4047 return shufflevector!(__m128d, 1, 3)(a, b); 4048 } 4049 } 4050 4051 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe 4052 { 4053 static if (GDC_with_SSE2) 4054 { 4055 return __builtin_ia32_punpcklwd128(a, b); 4056 } 4057 else static if (DMD_with_32bit_asm) 4058 { 4059 asm pure nothrow @nogc @trusted 4060 { 4061 movdqu XMM0, a; 4062 movdqu XMM1, b; 4063 punpcklwd XMM0, XMM1; 4064 movdqu a, XMM0; 4065 } 4066 return a; 4067 } 4068 else 4069 { 4070 return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11) 4071 (cast(short8)a, cast(short8)b); 4072 } 4073 } 4074 4075 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe 4076 { 4077 static if (GDC_with_SSE2) 4078 { 4079 return __builtin_ia32_punpckldq128(a, b); 4080 } 4081 else 4082 { 4083 return shufflevector!(int4, 0, 4, 1, 5) 4084 (cast(int4)a, cast(int4)b); 4085 } 4086 } 4087 4088 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted 4089 { 4090 static if (GDC_with_SSE2) 4091 { 4092 return __builtin_ia32_punpcklqdq128(a, b); 4093 } 4094 else 4095 { 4096 long2 lA = cast(long2)a; 4097 long2 lB = cast(long2)b; 4098 long2 R; 4099 R.ptr[0] = lA.array[0]; 4100 R.ptr[1] = lB.array[0]; 4101 return cast(__m128i)R; 4102 } 4103 } 4104 unittest // Issue #36 4105 { 4106 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4107 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4108 long2 C = cast(long2)(_mm_unpacklo_epi64(A, B)); 4109 long[2] correct = [0x22222222_22222222, 0x44444444_44444444]; 4110 assert(C.array == correct); 4111 } 4112 4113 4114 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe 4115 { 4116 static if (GDC_with_SSE2) 4117 { 4118 return __builtin_ia32_punpcklbw128(a, b); 4119 } 4120 else static if (DMD_with_32bit_asm) 4121 { 4122 asm pure nothrow @nogc @trusted 4123 { 4124 movdqu XMM0, a; 4125 movdqu XMM1, b; 4126 punpcklbw XMM0, XMM1; 4127 movdqu a, XMM0; 4128 } 4129 return a; 4130 } 4131 else 4132 { 4133 return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19, 4134 4, 20, 5, 21, 6, 22, 7, 23) 4135 (cast(byte16)a, cast(byte16)b); 4136 } 4137 } 4138 4139 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe 4140 { 4141 static if (GDC_with_SSE2) 4142 { 4143 return __builtin_ia32_unpcklpd(a, b); 4144 } 4145 else 4146 { 4147 return shufflevector!(__m128d, 0, 2)(a, b); 4148 } 4149 } 4150 4151 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe 4152 { 
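    // XOR the raw bit patterns of the two vectors; commonly used to flip or
    // clear the sign bits of packed doubles.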
4153 return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b); 4154 } 4155 4156 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe 4157 { 4158 return a ^ b; 4159 } 4160 4161 unittest 4162 { 4163 // distance between two points in 4D 4164 float distance(float[4] a, float[4] b) nothrow @nogc 4165 { 4166 __m128 va = _mm_loadu_ps(a.ptr); 4167 __m128 vb = _mm_loadu_ps(b.ptr); 4168 __m128 diffSquared = _mm_sub_ps(va, vb); 4169 diffSquared = _mm_mul_ps(diffSquared, diffSquared); 4170 __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared)); 4171 sum = _mm_add_ps(sum, _mm_srli_ps!4(sum)); 4172 return _mm_cvtss_f32(_mm_sqrt_ss(sum)); 4173 } 4174 assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2); 4175 }
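
// Minimal added sanity check for `_mm_xor_si128` (a sketch, not an exhaustive
// test): anything XOR-ed with itself is zero.
unittest
{
    __m128i A = _mm_setr_epi32(5, 6, 7, 8);
    int4 R = cast(int4) _mm_xor_si128(A, A);
    int[4] correct = [0, 0, 0, 0];
    assert(R.array == correct);
}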