1 /** 2 * Copyright: Copyright Auburn Sounds 2016-2019, Stefanos Baziotis 2019. 3 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 4 * Authors: Guillaume Piolat 5 */ 6 module inteli.emmintrin; 7 8 public import inteli.types; 9 public import inteli.xmmintrin; // SSE2 includes SSE1 10 import inteli.mmx; 11 import inteli.internals; 12 13 nothrow @nogc: 14 15 16 // SSE2 instructions 17 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2 18 19 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe 20 { 21 return cast(__m128i)(cast(short8)a + cast(short8)b); 22 } 23 24 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe 25 { 26 return cast(__m128i)(cast(int4)a + cast(int4)b); 27 } 28 29 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe 30 { 31 return cast(__m128i)(cast(long2)a + cast(long2)b); 32 } 33 34 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe 35 { 36 return cast(__m128i)(cast(byte16)a + cast(byte16)b); 37 } 38 39 static if (GDC_with_SSE2) 40 { 41 alias _mm_add_sd = __builtin_ia32_addsd; 42 } 43 else version(DigitalMars) 44 { 45 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 46 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe 47 { 48 asm pure nothrow @nogc @trusted { nop;} 49 a[0] = a[0] + b[0]; 50 return a; 51 } 52 } 53 else 54 { 55 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe 56 { 57 a[0] += b[0]; 58 return a; 59 } 60 } 61 unittest 62 { 63 __m128d a = [1.5, -2.0]; 64 a = _mm_add_sd(a, a); 65 assert(a.array == [3.0, -2.0]); 66 } 67 68 69 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe 70 { 71 return a + b; 72 } 73 unittest 74 { 75 __m128d a = [1.5, -2.0]; 76 a = _mm_add_pd(a, a); 77 assert(a.array == [3.0, -4.0]); 78 } 79 80 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe 81 { 82 return a + b; 83 } 84 85 static if (GDC_with_SSE2) 86 { 87 alias _mm_adds_epi16 = __builtin_ia32_paddsw128; 88 } 89 else version(LDC) 90 { 91 static if (__VERSION__ >= 2085) // 
saturation x86 intrinsics disappeared in LLVM 8 92 { 93 // Generates PADDSW since LDC 1.15 -O0 94 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted 95 { 96 enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 97 enum ir = ` 98 %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 99 ret <8 x i16> %r`; 100 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 101 } 102 } 103 else 104 alias _mm_adds_epi16 = __builtin_ia32_paddsw128; 105 } 106 else 107 { 108 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted 109 { 110 short[8] res; 111 short8 sa = cast(short8)a; 112 short8 sb = cast(short8)b; 113 foreach(i; 0..8) 114 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 115 return _mm_loadu_si128(cast(int4*)res.ptr); 116 } 117 } 118 unittest 119 { 120 short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), 121 _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0)); 122 static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14]; 123 assert(res.array == correctResult); 124 } 125 126 static if (GDC_with_SSE2) 127 { 128 alias _mm_adds_epi8 = __builtin_ia32_paddsb128; 129 } 130 else version(LDC) 131 { 132 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 133 { 134 // Generates PADDSB since LDC 1.15 -O0 135 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted 136 { 137 enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 138 enum ir = ` 139 %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 140 ret <16 x i8> %r`; 141 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 142 } 143 } 144 else 145 alias _mm_adds_epi8 = __builtin_ia32_paddsb128; 146 } 147 else 148 { 149 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted 150 { 151 byte[16] res; 152 byte16 sa = cast(byte16)a; 153 byte16 
sb = cast(byte16)b; 154 foreach(i; 0..16) 155 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]); 156 return _mm_loadu_si128(cast(int4*)res.ptr); 157 } 158 } 159 unittest 160 { 161 byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 162 _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 163 static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14, 164 16, 18, 20, 22, 24, 26, 28, 30]; 165 assert(res.array == correctResult); 166 } 167 168 version(LDC) 169 { 170 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 171 { 172 // Generates PADDUSB since LDC 1.15 -O0 173 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted 174 { 175 enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 176 enum ir = ` 177 %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 178 ret <16 x i8> %r`; 179 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 180 } 181 } 182 else 183 alias _mm_adds_epu8 = __builtin_ia32_paddusb128; 184 } 185 else 186 { 187 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted 188 { 189 ubyte[16] res; 190 byte16 sa = cast(byte16)a; 191 byte16 sb = cast(byte16)b; 192 foreach(i; 0..16) 193 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 194 return _mm_loadu_si128(cast(int4*)res.ptr); 195 } 196 } 197 198 version(LDC) 199 { 200 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 201 { 202 // Generates PADDUSW since LDC 1.15 -O0 203 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted 204 { 205 enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 206 enum ir = ` 207 %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 208 ret <8 x i16> %r`; 209 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, 
short8)(cast(short8)a, cast(short8)b); 210 } 211 } 212 else 213 alias _mm_adds_epu16 = __builtin_ia32_paddusw128; 214 } 215 else 216 { 217 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted 218 { 219 ushort[8] res; 220 short8 sa = cast(short8)a; 221 short8 sb = cast(short8)b; 222 foreach(i; 0..8) 223 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 224 return _mm_loadu_si128(cast(int4*)res.ptr); 225 } 226 } 227 228 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe 229 { 230 return cast(__m128d)( cast(__m128i)a & cast(__m128i)b ); 231 } 232 233 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe 234 { 235 return a & b; 236 } 237 unittest 238 { 239 __m128i A = _mm_set1_epi32(7); 240 __m128i B = _mm_set1_epi32(14); 241 __m128i R = _mm_and_si128(A, B); 242 int[4] correct = [6, 6, 6, 6]; 243 assert(R.array == correct); 244 } 245 246 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe 247 { 248 return cast(__m128d)( (~cast(__m128i)a) & cast(__m128i)b ); 249 } 250 251 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe 252 { 253 return (~a) & b; 254 } 255 unittest 256 { 257 __m128i A = _mm_set1_epi32(7); 258 __m128i B = _mm_set1_epi32(14); 259 __m128i R = _mm_andnot_si128(A, B); 260 int[4] correct = [8, 8, 8, 8]; 261 assert(R.array == correct); 262 } 263 264 static if (GDC_with_SSE2) 265 { 266 alias _mm_avg_epu16 = __builtin_ia32_pavgw128; 267 } 268 else version(LDC) 269 { 270 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @safe 271 { 272 // Generates pavgw even in LDC 1.0, even in -O0 273 enum ir = ` 274 %ia = zext <8 x i16> %0 to <8 x i32> 275 %ib = zext <8 x i16> %1 to <8 x i32> 276 %isum = add <8 x i32> %ia, %ib 277 %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 278 %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 279 %r = trunc <8 x i32> %isums to <8 x i16> 280 ret <8 x i16> %r`; 281 return cast(__m128i) 
LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b); 282 } 283 } 284 else 285 { 286 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @safe 287 { 288 short8 sa = cast(short8)a; 289 short8 sb = cast(short8)b; 290 short8 sr = void; 291 foreach(i; 0..8) 292 { 293 sr[i] = cast(ushort)( (cast(ushort)(sa[i]) + cast(ushort)(sb[i]) + 1) >> 1 ); 294 } 295 return cast(int4)sr; 296 } 297 } 298 unittest 299 { 300 __m128i A = _mm_set1_epi16(31); 301 __m128i B = _mm_set1_epi16(64); 302 short8 avg = cast(short8)(_mm_avg_epu16(A, B)); 303 foreach(i; 0..8) 304 assert(avg.array[i] == 48); 305 } 306 307 static if (GDC_with_SSE2) 308 { 309 alias _mm_avg_epu8 = __builtin_ia32_pavgb128; 310 } 311 else version(LDC) 312 { 313 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @safe 314 { 315 // Generates pavgb even in LDC 1.0, even in -O0 316 enum ir = ` 317 %ia = zext <16 x i8> %0 to <16 x i16> 318 %ib = zext <16 x i8> %1 to <16 x i16> 319 %isum = add <16 x i16> %ia, %ib 320 %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 321 %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 322 %r = trunc <16 x i16> %isums to <16 x i8> 323 ret <16 x i8> %r`; 324 return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 325 } 326 } 327 else 328 { 329 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @safe 330 { 331 byte16 sa = cast(byte16)a; 332 byte16 sb = cast(byte16)b; 333 byte16 sr = void; 334 foreach(i; 0..16) 335 { 336 sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 ); 337 } 338 return cast(int4)sr; 339 } 340 } 341 unittest 342 { 343 __m128i A = _mm_set1_epi8(31); 344 __m128i B = _mm_set1_epi8(64); 345 byte16 avg = cast(byte16)(_mm_avg_epu8(A, B)); 346 foreach(i; 0..16) 347 assert(avg.array[i] == 48); 348 } 349 350 351 alias 
_mm_bslli_si128 = _mm_slli_si128; 352 353 unittest 354 { 355 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 356 byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; 357 __m128i result = _mm_bslli_si128!5(toShift); 358 assert( (cast(byte16)result).array == exact); 359 } 360 361 alias _mm_bsrli_si128 = _mm_srli_si128; 362 363 unittest 364 { 365 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 366 byte[16] exact = [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0]; 367 __m128i result = _mm_bsrli_si128!5(toShift); 368 assert( (cast(byte16)result).array == exact); 369 } 370 371 __m128 _mm_castpd_ps (__m128d a) pure @safe 372 { 373 return cast(__m128)a; 374 } 375 376 __m128i _mm_castpd_si128 (__m128d a) pure @safe 377 { 378 return cast(__m128i)a; 379 } 380 381 __m128d _mm_castps_pd (__m128 a) pure @safe 382 { 383 return cast(__m128d)a; 384 } 385 386 __m128i _mm_castps_si128 (__m128 a) pure @safe 387 { 388 return cast(__m128i)a; 389 } 390 391 __m128d _mm_castsi128_pd (__m128i a) pure @safe 392 { 393 return cast(__m128d)a; 394 } 395 396 __m128 _mm_castsi128_ps (__m128i a) pure @safe 397 { 398 return cast(__m128)a; 399 } 400 401 static if (GDC_with_SSE2) 402 { 403 void _mm_clflush (const(void)* p) pure @safe 404 { 405 return __builtin_ia32_clflush(p); 406 } 407 } 408 else version(LDC) 409 { 410 alias _mm_clflush = __builtin_ia32_clflush; 411 } 412 else 413 { 414 void _mm_clflush (const(void)* p) pure @safe 415 { 416 version(D_InlineAsm_X86) 417 { 418 asm pure nothrow @nogc @safe 419 { 420 mov EAX, p; 421 clflush [EAX]; 422 } 423 } 424 else version(D_InlineAsm_X86_64) 425 { 426 asm pure nothrow @nogc @safe 427 { 428 mov RAX, p; 429 clflush [RAX]; 430 } 431 } 432 else 433 { 434 // Do nothing. Invalidating cacheline does 435 // not affect correctness. 
436 } 437 } 438 } 439 unittest 440 { 441 ubyte[64] cacheline; 442 _mm_clflush(cacheline.ptr); 443 } 444 445 static if (GDC_with_SSE2) 446 { 447 alias _mm_cmpeq_epi16 = __builtin_ia32_pcmpeqw128; 448 } 449 else 450 { 451 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe 452 { 453 return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b); 454 } 455 } 456 unittest 457 { 458 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 459 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 460 short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0]; 461 short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B)); 462 assert(R.array == E); 463 } 464 465 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe 466 { 467 static if (GDC_with_SSE2) 468 { 469 return __builtin_ia32_pcmpeqd128(a, b); 470 } 471 else 472 { 473 return equalMask!__m128i(a, b); 474 } 475 } 476 unittest 477 { 478 int4 A = [-3, -2, -1, 0]; 479 int4 B = [ 4, -2, 2, 0]; 480 int[4] E = [ 0, -1, 0, -1]; 481 int4 R = cast(int4)(_mm_cmpeq_epi16(A, B)); 482 assert(R.array == E); 483 } 484 485 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe 486 { 487 static if (GDC_with_SSE2) 488 { 489 return __builtin_ia32_pcmpeqb128(a, b); 490 } 491 else 492 { 493 return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b); 494 } 495 } 496 unittest 497 { 498 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 499 __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 500 byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B); 501 byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1]; 502 assert(C.array == correct); 503 } 504 505 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe 506 { 507 static if (GDC_with_SSE2) 508 { 509 return __builtin_ia32_cmpeqpd(a, b); 510 } 511 else 512 { 513 return cast(__m128d) cmppd!(FPComparison.oeq)(a, b); 514 } 515 } 516 517 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe 518 { 519 static if (GDC_with_SSE2) 520 { 521 return 
__builtin_ia32_cmpeqsd(a, b); 522 } 523 else 524 { 525 return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b); 526 } 527 } 528 529 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe 530 { 531 static if (GDC_with_SSE2) 532 { 533 return __builtin_ia32_cmpgepd(a, b); 534 } 535 else 536 { 537 return cast(__m128d) cmppd!(FPComparison.oge)(a, b); 538 } 539 } 540 541 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe 542 { 543 // Note: There is no __builtin_ia32_cmpgesd builtin. 544 static if (GDC_with_SSE2) 545 { 546 return __builtin_ia32_cmpnltsd(b, a); 547 } 548 else 549 { 550 return cast(__m128d) cmpsd!(FPComparison.oge)(a, b); 551 } 552 } 553 554 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe 555 { 556 static if (GDC_with_SSE2) 557 { 558 return __builtin_ia32_pcmpgtw128(a, b); 559 } 560 else 561 { 562 return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b)); 563 } 564 } 565 unittest 566 { 567 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 568 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 569 short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1]; 570 short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B)); 571 assert(R.array == E); 572 } 573 574 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe 575 { 576 static if (GDC_with_SSE2) 577 { 578 return __builtin_ia32_pcmpgtd128(a, b); 579 } 580 else 581 { 582 return cast(__m128i)( greaterMask!int4(a, b)); 583 } 584 } 585 unittest 586 { 587 int4 A = [-3, 2, -1, 0]; 588 int4 B = [ 4, -2, 2, 0]; 589 int[4] E = [ 0, -1, 0, 0]; 590 int4 R = cast(int4)(_mm_cmpgt_epi32(A, B)); 591 assert(R.array == E); 592 } 593 594 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe 595 { 596 static if (GDC_with_SSE2) 597 { 598 return __builtin_ia32_pcmpgtb128(a, b); 599 } 600 else 601 { 602 return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b)); 603 } 604 } 605 unittest 606 { 607 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 608 __m128i B = _mm_setr_epi8(2, 2, 1, 
2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct = [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    // (removed a leftover, unused `_mm_cmpeq_epi8(A, B)` statement whose
    //  result was never checked)
    assert(C.array == correct);
}

/// Compare packed doubles: result lane is all-ones where `a > b` (ordered).
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}

/// Compare lower doubles: `a > b` (ordered); upper lane passed through from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}

/// Compare packed doubles: result lane is all-ones where `a <= b` (ordered).
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare lower doubles: `a <= b` (ordered); upper lane passed through from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}

/// Compare packed 16-bit integers: lane is all-ones where `a < b`.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed 32-bit integers: lane is all-ones where `a < b`.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed 8-bit integers: lane is all-ones where `a < b`.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

/// Compare packed doubles: result lane is all-ones where `a < b` (ordered).
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}

/// Compare lower doubles: `a < b` (ordered); upper lane passed through from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b);
    }
    else
    {
        return
cast(__m128d) cmpsd!(FPComparison.olt)(a, b); 700 } 701 } 702 703 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe 704 { 705 static if (GDC_with_SSE2) 706 { 707 return __builtin_ia32_cmpneqpd(a, b); 708 } 709 else 710 { 711 return cast(__m128d) cmppd!(FPComparison.une)(a, b); 712 } 713 } 714 715 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe 716 { 717 static if (GDC_with_SSE2) 718 { 719 return __builtin_ia32_cmpneqsd(a, b); 720 } 721 else 722 { 723 return cast(__m128d) cmpsd!(FPComparison.une)(a, b); 724 } 725 } 726 727 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe 728 { 729 static if (GDC_with_SSE2) 730 { 731 return __builtin_ia32_cmpngepd(a, b); 732 } 733 else 734 { 735 return cast(__m128d) cmppd!(FPComparison.ult)(a, b); 736 } 737 } 738 739 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe 740 { 741 // Note: There is no __builtin_ia32_cmpngesd builtin. 742 static if (GDC_with_SSE2) 743 { 744 return __builtin_ia32_cmpltsd(b, a); 745 } 746 else 747 { 748 return cast(__m128d) cmpsd!(FPComparison.ult)(a, b); 749 } 750 } 751 752 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe 753 { 754 static if (GDC_with_SSE2) 755 { 756 return __builtin_ia32_cmpngtpd(a, b); 757 } 758 else 759 { 760 return cast(__m128d) cmppd!(FPComparison.ule)(a, b); 761 } 762 } 763 764 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe 765 { 766 // Note: There is no __builtin_ia32_cmpngtsd builtin. 
767 static if (GDC_with_SSE2) 768 { 769 return __builtin_ia32_cmplesd(b, a); 770 } 771 else 772 { 773 return cast(__m128d) cmpsd!(FPComparison.ule)(a, b); 774 } 775 } 776 777 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe 778 { 779 static if (GDC_with_SSE2) 780 { 781 return __builtin_ia32_cmpnlepd(a, b); 782 } 783 else 784 { 785 return cast(__m128d) cmppd!(FPComparison.ugt)(a, b); 786 } 787 } 788 789 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe 790 { 791 static if (GDC_with_SSE2) 792 { 793 return __builtin_ia32_cmpnlesd(a, b); 794 } 795 else 796 { 797 return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b); 798 } 799 } 800 801 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe 802 { 803 static if (GDC_with_SSE2) 804 { 805 return __builtin_ia32_cmpnltpd(a, b); 806 } 807 else 808 { 809 return cast(__m128d) cmppd!(FPComparison.uge)(a, b); 810 } 811 } 812 813 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe 814 { 815 static if (GDC_with_SSE2) 816 { 817 return __builtin_ia32_cmpnltsd(a, b); 818 } 819 else 820 { 821 return cast(__m128d) cmpsd!(FPComparison.uge)(a, b); 822 } 823 } 824 825 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe 826 { 827 static if (GDC_with_SSE2) 828 { 829 return __builtin_ia32_cmpordpd(a, b); 830 } 831 else 832 { 833 return cast(__m128d) cmppd!(FPComparison.ord)(a, b); 834 } 835 } 836 837 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe 838 { 839 static if (GDC_with_SSE2) 840 { 841 return __builtin_ia32_cmpordsd(a, b); 842 } 843 else 844 { 845 return cast(__m128d) cmpsd!(FPComparison.ord)(a, b); 846 } 847 } 848 849 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe 850 { 851 static if (GDC_with_SSE2) 852 { 853 return __builtin_ia32_cmpunordpd(a, b); 854 } 855 else 856 { 857 return cast(__m128d) cmppd!(FPComparison.uno)(a, b); 858 } 859 } 860 861 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe 862 { 863 static if (GDC_with_SSE2) 864 { 865 return __builtin_ia32_cmpunordsd(a, b); 866 } 867 
else 868 { 869 return cast(__m128d) cmpsd!(FPComparison.uno)(a, b); 870 } 871 } 872 873 874 // Note: we've reverted clang and GCC behaviour with regards to EFLAGS 875 // Some such comparisons yields true for NaNs, other don't. 876 877 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe 878 { 879 static if (GDC_with_SSE2) 880 { 881 return __builtin_ia32_comieq(a, b); 882 } 883 else 884 { 885 return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC 886 } 887 } 888 889 int _mm_comige_sd (__m128d a, __m128d b) pure @safe 890 { 891 static if (GDC_with_SSE2) 892 { 893 return __builtin_ia32_comige(a, b); 894 } 895 else 896 { 897 return comsd!(FPComparison.oge)(a, b); 898 } 899 } 900 901 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe 902 { 903 static if (GDC_with_SSE2) 904 { 905 return __builtin_ia32_comigt(a, b); 906 } 907 else 908 { 909 return comsd!(FPComparison.ogt)(a, b); 910 } 911 } 912 913 int _mm_comile_sd (__m128d a, __m128d b) pure @safe 914 { 915 static if (GDC_with_SSE2) 916 { 917 return __builtin_ia32_comile(a, b); 918 } 919 else 920 { 921 return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC 922 } 923 } 924 925 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe 926 { 927 static if (GDC_with_SSE2) 928 { 929 return __builtin_ia32_comilt(a, b); 930 } 931 else 932 { 933 return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC 934 } 935 } 936 937 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe 938 { 939 static if (GDC_with_SSE2) 940 { 941 return __builtin_ia32_comineq(a, b); 942 } 943 else 944 { 945 return comsd!(FPComparison.one)(a, b); 946 } 947 } 948 949 version(LDC) 950 { 951 __m128d _mm_cvtepi32_pd (__m128i a) pure @safe 952 { 953 // Generates cvtdq2pd since LDC 1.0, even without optimizations 954 enum ir = ` 955 %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1> 956 %r = sitofp <2 x i32> %v to <2 x double> 957 ret <2 x double> %r`; 958 return cast(__m128d) 
LDCInlineIR!(ir, __m128d, __m128i)(a); 959 } 960 } 961 else 962 { 963 static if (GDC_with_SSE2) 964 { 965 966 __m128d _mm_cvtepi32_pd (__m128i a) pure @safe 967 { 968 return __builtin_ia32_cvtdq2pd(a); 969 } 970 } 971 else 972 { 973 __m128d _mm_cvtepi32_pd (__m128i a) pure @safe 974 { 975 double2 r = void; 976 r[0] = a[0]; 977 r[1] = a[1]; 978 return r; 979 } 980 } 981 } 982 unittest 983 { 984 __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54)); 985 assert(A.array[0] == 54.0); 986 assert(A.array[1] == 54.0); 987 } 988 989 __m128 _mm_cvtepi32_ps(__m128i a) pure @safe 990 { 991 static if (GDC_with_SSE2) 992 { 993 return __builtin_ia32_cvtdq2ps(a); 994 } 995 else 996 { 997 // Generates cvtdq2ps since LDC 1.0.0 -O1 998 __m128 res; 999 res.array[0] = cast(float)a.array[0]; 1000 res.array[1] = cast(float)a.array[1]; 1001 res.array[2] = cast(float)a.array[2]; 1002 res.array[3] = cast(float)a.array[3]; 1003 return res; 1004 } 1005 } 1006 unittest 1007 { 1008 __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000)); 1009 assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]); 1010 } 1011 1012 1013 version(LDC) 1014 { 1015 // Like in clang, implemented with a magic intrinsic right now 1016 alias _mm_cvtpd_epi32 = __builtin_ia32_cvtpd2dq; 1017 1018 /* Unfortunately this generates a cvttpd2dq instruction 1019 __m128i _mm_cvtpd_epi32 (__m128d a) pure @safe 1020 { 1021 enum ir = ` 1022 %i = fptosi <2 x double> %0 to <2 x i32> 1023 %r = shufflevector <2 x i32> %i,<2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1024 ret <4 x i32> %r`; 1025 1026 return cast(__m128i) inlineIR!(ir, __m128i, __m128d)(a); 1027 } */ 1028 } 1029 else 1030 { 1031 static if (GDC_with_SSE2) 1032 { 1033 alias _mm_cvtpd_epi32 = __builtin_ia32_cvtpd2dq; 1034 } 1035 else 1036 { 1037 __m128i _mm_cvtpd_epi32 (__m128d a) pure @safe 1038 { 1039 __m128i r = _mm_setzero_si128(); 1040 r[0] = convertDoubleToInt32UsingMXCSR(a[0]); 1041 r[1] = convertDoubleToInt32UsingMXCSR(a[1]); 1042 return r; 1043 } 1044 } 
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
/// (Doc fix: the second line used `//` instead of `///`, truncating the
/// ddoc comment mid-sentence.)
__m64 _mm_cvtpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

version(LDC)
{
    alias _mm_cvtpd_ps = __builtin_ia32_cvtpd2ps; // can't be done with IR unfortunately
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_cvtpd_ps = __builtin_ia32_cvtpd2ps; // can't be done with IR unfortunately
    }
    else
    {
        /// Convert the two doubles in `a` to floats in the low two lanes;
        /// upper two lanes are zeroed.
        __m128 _mm_cvtpd_ps (__m128d a) pure @safe
        {
            __m128 r = void;
            r[0] = a[0];
            r[1] = a[1];
            r[2] = 0;
            r[3] = 0;
            return r;
        }
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.
1096 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe 1097 { 1098 return _mm_cvtepi32_pd(to_m128i(v)); 1099 } 1100 unittest 1101 { 1102 __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5)); 1103 assert(A.array[0] == 4.0 && A.array[1] == -5.0); 1104 } 1105 1106 version(LDC) 1107 { 1108 // Disabled, since it fail with optimizations unfortunately 1109 //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq; 1110 1111 __m128i _mm_cvtps_epi32 (__m128 a) pure @trusted 1112 { 1113 return __asm!__m128i("cvtps2dq $1,$0","=x,x",a); 1114 } 1115 } 1116 else 1117 { 1118 static if (GDC_with_SSE2) 1119 { 1120 alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq; 1121 } 1122 else 1123 { 1124 __m128i _mm_cvtps_epi32 (__m128 a) pure @safe 1125 { 1126 __m128i r = void; 1127 r[0] = convertFloatToInt32UsingMXCSR(a[0]); 1128 r[1] = convertFloatToInt32UsingMXCSR(a[1]); 1129 r[2] = convertFloatToInt32UsingMXCSR(a[2]); 1130 r[3] = convertFloatToInt32UsingMXCSR(a[3]); 1131 return r; 1132 } 1133 } 1134 } 1135 unittest 1136 { 1137 uint savedRounding = _MM_GET_ROUNDING_MODE(); 1138 1139 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 1140 __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 1141 assert(A.array == [1, -2, 54, -3]); 1142 1143 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 1144 A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 1145 assert(A.array == [1, -3, 53, -3]); 1146 1147 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 1148 A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 1149 assert(A.array == [2, -2, 54, -2]); 1150 1151 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 1152 A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 1153 assert(A.array == [1, -2, 53, -2]); 1154 1155 _MM_SET_ROUNDING_MODE(savedRounding); 1156 } 1157 1158 1159 version(LDC) 1160 { 1161 __m128d _mm_cvtps_pd (__m128 a) pure @safe 1162 { 1163 // Generates cvtps2pd since LDC 1.0, no opt 1164 enum ir = ` 1165 %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1> 1166 %r = fpext <2 x 
float> %v to <2 x double> 1167 ret <2 x double> %r`; 1168 return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a); 1169 } 1170 } 1171 else 1172 { 1173 static if (GDC_with_SSE2) 1174 { 1175 alias _mm_cvtps_pd = __builtin_ia32_cvtps2pd; 1176 } 1177 else 1178 { 1179 __m128d _mm_cvtps_pd (__m128 a) pure @safe 1180 { 1181 double2 r = void; 1182 r[0] = a[0]; 1183 r[1] = a[1]; 1184 return r; 1185 } 1186 } 1187 } 1188 unittest 1189 { 1190 __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f)); 1191 assert(A.array[0] == 54.0); 1192 assert(A.array[1] == 54.0); 1193 } 1194 1195 double _mm_cvtsd_f64 (__m128d a) pure @safe 1196 { 1197 return a.array[0]; 1198 } 1199 1200 version(LDC) 1201 { 1202 alias _mm_cvtsd_si32 = __builtin_ia32_cvtsd2si; 1203 } 1204 else 1205 { 1206 static if (GDC_with_SSE2) 1207 { 1208 alias _mm_cvtsd_si32 = __builtin_ia32_cvtsd2si; 1209 } 1210 else 1211 { 1212 int _mm_cvtsd_si32 (__m128d a) pure @safe 1213 { 1214 return convertDoubleToInt32UsingMXCSR(a[0]); 1215 } 1216 } 1217 } 1218 unittest 1219 { 1220 assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0))); 1221 } 1222 1223 version(LDC) 1224 { 1225 // Unfortunately this builtin crashes in 32-bit 1226 version(X86_64) 1227 alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64; 1228 else 1229 { 1230 long _mm_cvtsd_si64 (__m128d a) pure @safe 1231 { 1232 return convertDoubleToInt64UsingMXCSR(a[0]); 1233 } 1234 } 1235 } 1236 else 1237 { 1238 long _mm_cvtsd_si64 (__m128d a) pure @safe 1239 { 1240 return convertDoubleToInt64UsingMXCSR(a.array[0]); 1241 } 1242 } 1243 unittest 1244 { 1245 assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0))); 1246 1247 uint savedRounding = _MM_GET_ROUNDING_MODE(); 1248 1249 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 1250 assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.5))); 1251 1252 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 1253 assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1))); 1254 1255 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 1256 assert(56468486187 == 
_mm_cvtsd_si64(_mm_set1_pd(56468486186.1))); 1257 1258 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 1259 assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9))); 1260 1261 _MM_SET_ROUNDING_MODE(savedRounding); 1262 } 1263 1264 alias _mm_cvtsd_si64x = _mm_cvtsd_si64; 1265 1266 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @safe 1267 { 1268 static if (GDC_with_SSE2) 1269 { 1270 return __builtin_ia32_cvtsd2ss(a, b); 1271 } 1272 else 1273 { 1274 // Generates cvtsd2ss since LDC 1.3 -O0 1275 a[0] = b[0]; 1276 return a; 1277 } 1278 } 1279 unittest 1280 { 1281 __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0)); 1282 assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]); 1283 } 1284 1285 int _mm_cvtsi128_si32 (__m128i a) pure @safe 1286 { 1287 return a.array[0]; 1288 } 1289 1290 long _mm_cvtsi128_si64 (__m128i a) pure @safe 1291 { 1292 long2 la = cast(long2)a; 1293 return la.array[0]; 1294 } 1295 alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64; 1296 1297 __m128d _mm_cvtsi32_sd(__m128d v, int x) pure @trusted 1298 { 1299 v.ptr[0] = cast(double)x; 1300 return v; 1301 } 1302 unittest 1303 { 1304 __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42); 1305 assert(a.array == [42.0, 0]); 1306 } 1307 1308 __m128i _mm_cvtsi32_si128 (int a) pure @trusted 1309 { 1310 int4 r = [0, 0, 0, 0]; 1311 r.ptr[0] = a; 1312 return r; 1313 } 1314 unittest 1315 { 1316 __m128i a = _mm_cvtsi32_si128(65); 1317 assert(a.array == [65, 0, 0, 0]); 1318 } 1319 1320 1321 // Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy 1322 __m128d _mm_cvtsi64_sd(__m128d v, long x) pure @trusted 1323 { 1324 v.ptr[0] = cast(double)x; 1325 return v; 1326 } 1327 unittest 1328 { 1329 __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42); 1330 assert(a.array == [42.0, 0]); 1331 } 1332 1333 __m128i _mm_cvtsi64_si128 (long a) pure @trusted 1334 { 1335 long2 r = [0, 0]; 1336 r.ptr[0] = a; 1337 return cast(__m128i)(r); 1338 } 1339 1340 alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; 1341 alias _mm_cvtsi64x_si128 = 
_mm_cvtsi64_si128; 1342 1343 double2 _mm_cvtss_sd(double2 v, float4 x) pure @trusted 1344 { 1345 v.ptr[0] = x.array[0]; 1346 return v; 1347 } 1348 unittest 1349 { 1350 __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f)); 1351 assert(a.array == [42.0, 0]); 1352 } 1353 1354 long _mm_cvttss_si64 (__m128 a) pure @safe 1355 { 1356 return cast(long)(a.array[0]); // Generates cvttss2si as expected 1357 } 1358 unittest 1359 { 1360 assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); 1361 } 1362 1363 version(LDC) 1364 { 1365 alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq; 1366 } 1367 else 1368 { 1369 static if (GDC_with_SSE2) 1370 { 1371 alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq; 1372 } 1373 else 1374 { 1375 __m128i _mm_cvttpd_epi32 (__m128d a) pure @safe 1376 { 1377 // Note: doesn't generate cvttpd2dq as of LDC 1.13 1378 __m128i r; 1379 r.array[0] = cast(int)a.array[0]; 1380 r.array[1] = cast(int)a.array[1]; 1381 r.array[2] = 0; 1382 r.array[3] = 0; 1383 return r; 1384 } 1385 } 1386 } 1387 unittest 1388 { 1389 __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f)); 1390 assert(R.array == [-4, 45641, 0, 0]); 1391 } 1392 1393 1394 /// Convert packed double-precision (64-bit) floating-point elements in `v` 1395 /// to packed 32-bit integers with truncation. 
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation (round toward zero).
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // Note: Generates cvttps2dq since LDC 1.3 -O2
    __m128i r;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

/// Convert the lower double-precision element of `a` to a 32-bit integer with
/// truncation (unlike `_mm_cvtsd_si32`, which rounds according to MXCSR).
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}

/// Convert the lower double-precision element of `a` to a 64-bit integer with
/// truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resort to FPU
    return cast(long)a.array[0];
}

alias _mm_cvttsd_si64x = _mm_cvttsd_si64;

/// Divide packed double-precision (64-bit) floating-point elements in `a` by `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}

// _mm_div_sd: divide the lower element of `a` by the lower element of `b`;
// the upper element of the result comes from `a`.
static if (GDC_with_SSE2)
{
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
    {
        return __builtin_ia32_divsd(a, b);
    }
}
else version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    // (the no-op asm block sidesteps the DMD codegen issue described there)
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
}
else
{
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        a.array[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`
/// (the index is taken modulo 8, matching the 3-bit immediate of PEXTRW).
/// The selected element is returned zero-extended to 32 bits.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    // Mask the selector like _mm_insert_epi16 below does, so an out-of-range
    // index wraps instead of reading past the vector.
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
    assert(_mm_extract_epi16(A, 8) == 65535); // selector wraps to lane 0
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

// _mm_lfence: serialize load operations (LFENCE instruction).
// Three implementations, one per supported compiler.
version(GNU)
{
    void _mm_lfence() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else version(LDC)
{
    alias _mm_lfence = __builtin_ia32_lfence;
}
else static if (DMD_with_asm)
{
    void _mm_lfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
}
else
    static assert(false);
unittest
{
    _mm_lfence();
}


/// Load two double-precision elements from memory.
/// `mem_addr` is dereferenced as an aligned `__m128d` (16-byte alignment expected).
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}

/// Load one double from memory, broadcast to both elements.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] arr = [*mem_addr, *mem_addr];
    return loadUnaligned!(double2)(&arr[0]);
}

/// Load one double from memory into the lower element; upper element is zeroed.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a =
_mm_load_sd(&x); 1566 assert(a.array == [-42.0, 0.0]); 1567 } 1568 1569 __m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted 1570 { 1571 return *mem_addr; 1572 } 1573 1574 alias _mm_load1_pd = _mm_load_pd1; 1575 1576 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted 1577 { 1578 a.ptr[1] = *mem_addr; 1579 return a; 1580 } 1581 1582 // Note: strange signature since the memory doesn't have to aligned 1583 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted 1584 { 1585 auto pLong = cast(const(long)*)mem_addr; 1586 long2 r = [0, 0]; 1587 r.ptr[0] = *pLong; 1588 return cast(__m128i)(r); 1589 } 1590 1591 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted 1592 { 1593 a.ptr[0] = *mem_addr; 1594 return a; 1595 } 1596 1597 __m128d _mm_loadr_pd2 (const(double)* mem_addr) pure @trusted 1598 { 1599 __m128d a = *cast(__m128d*)(mem_addr); 1600 __m128d r; 1601 r.ptr[0] = a.array[1]; 1602 r.ptr[1] = a.array[0]; 1603 return r; 1604 } 1605 1606 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe 1607 { 1608 static if (GDC_with_SSE2) 1609 { 1610 return __builtin_ia32_loadupd(mem_addr); 1611 } 1612 else 1613 { 1614 return loadUnaligned!(double2)(mem_addr); 1615 } 1616 } 1617 1618 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted 1619 { 1620 static if (GDC_with_SSE2) 1621 { 1622 return __builtin_ia32_loaddqu(cast(const(char*))mem_addr); 1623 } 1624 else 1625 { 1626 return loadUnaligned!(__m128i)(cast(int*)mem_addr); 1627 } 1628 } 1629 1630 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted 1631 { 1632 int r = *cast(int*)(mem_addr); 1633 int4 result = [0, 0, 0, 0]; 1634 result.ptr[0] = r; 1635 return result; 1636 } 1637 unittest 1638 { 1639 int r = 42; 1640 __m128i A = _mm_loadu_si32(&r); 1641 int[4] correct = [42, 0, 0, 0]; 1642 assert(A.array == correct); 1643 } 1644 1645 static if (GDC_with_SSE2) 1646 { 1647 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 
1648 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 1649 /// and pack the results in destination. 1650 alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128; 1651 } 1652 else version(LDC) 1653 { 1654 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 1655 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 1656 /// and pack the results in destination. 1657 alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128; 1658 } 1659 else 1660 { 1661 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 1662 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 1663 /// and pack the results in destination. 1664 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe 1665 { 1666 short8 sa = cast(short8)a; 1667 short8 sb = cast(short8)b; 1668 1669 int4 r; 1670 foreach(i; 0..4) 1671 { 1672 r.array[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1]; 1673 } 1674 return r; 1675 } 1676 } 1677 unittest 1678 { 1679 short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 1680 short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 1681 int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B); 1682 int[4] correct = [1, 13, -2147483648, 2*32767*32767]; 1683 assert(R.array == correct); 1684 } 1685 1686 version(LDC) 1687 { 1688 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 1689 /// (elements are not stored when the highest bit is not set in the corresponding element) 1690 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 1691 /// boundary. 
1692 alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu; // can't do it with pure IR 1693 } 1694 else 1695 { 1696 static if (GDC_with_SSE2) 1697 { 1698 ///ditto 1699 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted 1700 { 1701 return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr); 1702 } 1703 } 1704 else 1705 { 1706 ///ditto 1707 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted 1708 { 1709 byte16 b = cast(byte16)a; 1710 byte16 m = cast(byte16)mask; 1711 byte* dest = cast(byte*)(mem_addr); 1712 foreach(j; 0..16) 1713 { 1714 if (m.array[j] & 128) 1715 { 1716 dest[j] = b.array[j]; 1717 } 1718 } 1719 } 1720 } 1721 } 1722 unittest 1723 { 1724 ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]; 1725 __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0); 1726 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15); 1727 _mm_maskmoveu_si128(A, mask, dest.ptr); 1728 ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42]; 1729 assert(dest == correct); 1730 } 1731 1732 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe 1733 { 1734 // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have 1735 __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else 1736 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1737 __m128i mask = _mm_and_si128(aTob, lowerShorts); 1738 return _mm_xor_si128(b, mask); 1739 } 1740 unittest 1741 { 1742 short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-57), 1743 _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0)); 1744 short[8] correct = [45, 1, 9, 7, 9, 7, 0, 0]; 1745 assert(R.array == correct); 1746 } 1747 1748 1749 // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have 1750 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe 1751 { 1752 // Same remark as with 
_mm_min_epi16: clang uses mystery intrinsics we don't have 1753 __m128i value128 = _mm_set1_epi8(-128); 1754 __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 1755 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1756 __m128i mask = _mm_and_si128(aTob, higher); 1757 return _mm_xor_si128(b, mask); 1758 } 1759 unittest 1760 { 1761 byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 1762 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 1763 byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 1764 assert(R.array == correct); 1765 } 1766 1767 __m128d _mm_max_pd (__m128d a, __m128d b) pure @safe 1768 { 1769 static if (GDC_with_SSE2) 1770 { 1771 return __builtin_ia32_maxpd(a, b); 1772 } 1773 else 1774 { 1775 // Generates maxpd starting with LDC 1.9 1776 a[0] = (a[0] > b[0]) ? a[0] : b[0]; 1777 a[1] = (a[1] > b[1]) ? a[1] : b[1]; 1778 return a; 1779 } 1780 } 1781 unittest 1782 { 1783 __m128d A = _mm_setr_pd(4.0, 1.0); 1784 __m128d B = _mm_setr_pd(1.0, 8.0); 1785 __m128d M = _mm_max_pd(A, B); 1786 assert(M.array[0] == 4.0); 1787 assert(M.array[1] == 8.0); 1788 } 1789 1790 __m128d _mm_max_sd (__m128d a, __m128d b) pure @safe 1791 { 1792 static if (GDC_with_SSE2) 1793 { 1794 return __builtin_ia32_maxsd(a, b); 1795 } 1796 else 1797 { 1798 __m128d r = a; 1799 // Generates maxsd starting with LDC 1.3 1800 r.array[0] = (a.array[0] > b.array[0]) ? 
a.array[0] : b.array[0]; 1801 return r; 1802 } 1803 } 1804 unittest 1805 { 1806 __m128d A = _mm_setr_pd(1.0, 1.0); 1807 __m128d B = _mm_setr_pd(4.0, 2.0); 1808 __m128d M = _mm_max_sd(A, B); 1809 assert(M.array[0] == 4.0); 1810 assert(M.array[1] == 1.0); 1811 } 1812 1813 version(GNU) 1814 { 1815 void _mm_mfence() pure @trusted 1816 { 1817 static if (GDC_with_SSE2) 1818 { 1819 __builtin_ia32_mfence(); 1820 } 1821 else version(X86) 1822 { 1823 asm pure nothrow @nogc @trusted 1824 { 1825 "mfence;\n" : : : ; 1826 } 1827 } 1828 else 1829 static assert(false); 1830 } 1831 } 1832 else version(LDC) 1833 { 1834 alias _mm_mfence = __builtin_ia32_mfence; 1835 } 1836 else static if (DMD_with_asm) 1837 { 1838 void _mm_mfence() pure @safe 1839 { 1840 asm nothrow @nogc pure @safe 1841 { 1842 mfence; 1843 } 1844 } 1845 } 1846 else 1847 static assert(false); 1848 unittest 1849 { 1850 _mm_mfence(); 1851 } 1852 1853 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe 1854 { 1855 // Note: clang uses a __builtin_ia32_pminsw128 which has disappeared from LDC LLVM (?) 
1856 // Implemented using masks and XOR 1857 __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else 1858 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1859 __m128i mask = _mm_and_si128(aTob, lowerShorts); 1860 return _mm_xor_si128(b, mask); 1861 } 1862 unittest 1863 { 1864 short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-57), 1865 _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0)); 1866 short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -57]; 1867 assert(R.array == correct); 1868 } 1869 1870 1871 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe 1872 { 1873 // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have 1874 __m128i value128 = _mm_set1_epi8(-128); 1875 __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 1876 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1877 __m128i mask = _mm_and_si128(aTob, lower); 1878 return _mm_xor_si128(b, mask); 1879 } 1880 unittest 1881 { 1882 byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 1883 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 1884 byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; 1885 assert(R.array == correct); 1886 } 1887 1888 __m128d _mm_min_pd (__m128d a, __m128d b) pure @safe 1889 { 1890 static if (GDC_with_SSE2) 1891 { 1892 return __builtin_ia32_minpd(a, b); 1893 } 1894 else 1895 { 1896 // Generates minpd starting with LDC 1.9 1897 a.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 1898 a.array[1] = (a.array[1] < b.array[1]) ? 
a.array[1] : b.array[1]; 1899 return a; 1900 } 1901 } 1902 unittest 1903 { 1904 __m128d A = _mm_setr_pd(1.0, 2.0); 1905 __m128d B = _mm_setr_pd(4.0, 1.0); 1906 __m128d M = _mm_min_pd(A, B); 1907 assert(M.array[0] == 1.0); 1908 assert(M.array[1] == 1.0); 1909 } 1910 1911 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe 1912 { 1913 static if (GDC_with_SSE2) 1914 { 1915 return __builtin_ia32_minsd(a, b); 1916 } 1917 else 1918 { 1919 // Generates minsd starting with LDC 1.3 1920 __m128d r = a; 1921 r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 1922 return r; 1923 } 1924 } 1925 unittest 1926 { 1927 __m128d A = _mm_setr_pd(1.0, 3.0); 1928 __m128d B = _mm_setr_pd(4.0, 2.0); 1929 __m128d M = _mm_min_sd(A, B); 1930 assert(M.array[0] == 1.0); 1931 assert(M.array[1] == 3.0); 1932 } 1933 1934 __m128i _mm_move_epi64 (__m128i a) pure @safe 1935 { 1936 static if (GDC_with_SSE2) 1937 { 1938 return __builtin_ia32_movq128(a); 1939 } 1940 else 1941 { 1942 long2 result = [ 0, 0 ]; 1943 long2 la = cast(long2) a; 1944 result.array[0] = la.array[0]; 1945 return cast(__m128i)(result); 1946 } 1947 } 1948 unittest 1949 { 1950 long2 A = [13, 47]; 1951 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); 1952 long[2] correct = [13, 0]; 1953 assert(B.array == correct); 1954 } 1955 1956 __m128d _mm_move_sd (__m128d a, __m128d b) pure @safe 1957 { 1958 static if (GDC_with_SSE2) 1959 { 1960 return __builtin_ia32_movsd(a, b); 1961 } 1962 else 1963 { 1964 b.array[1] = a.array[1]; 1965 return b; 1966 } 1967 } 1968 unittest 1969 { 1970 double2 A = [13.0, 47.0]; 1971 double2 B = [34.0, 58.0]; 1972 double2 C = _mm_move_sd(A, B); 1973 double[2] correct = [34.0, 47.0]; 1974 assert(C.array == correct); 1975 } 1976 1977 version(LDC) 1978 { 1979 /// Create mask from the most significant bit of each 8-bit element in `v`. 
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Create mask from the most significant bit of each 8-bit element in `v`.
        alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
    }
    else
    {
        /// Create mask from the most significant bit of each 8-bit element in `v`.
        int _mm_movemask_epi8(__m128i v) pure @safe
        {
            byte16 ai = cast(byte16)v;
            int r = 0;
            foreach(bit; 0..16)
            {
                // A negative signed byte is exactly one whose top bit is set.
                if (ai.array[bit] < 0) r += (1 << bit);
            }
            return r;
        }
    }
}
unittest
{
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 0, 0, -1, -1, -1, 0, 0, 0, 0, -1, -1, 0, -1, -1, 0)));
}

version(LDC)
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding
    /// packed double-precision (64-bit) floating-point element in `v`.
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}
else
{
    static if (GDC_with_SSE2)
    {
        /// Set each bit of mask `dst` based on the most significant bit of the corresponding
        /// packed double-precision (64-bit) floating-point element in `v`.
        alias _mm_movemask_pd = __builtin_ia32_movmskpd;
    }
    else
    {
        /// Set each bit of mask `dst` based on the most significant bit of the corresponding
        /// packed double-precision (64-bit) floating-point element in `v`.
        int _mm_movemask_pd(__m128d v) pure @safe
        {
            // Reinterpret as integers: a double's sign bit is the long's sign bit.
            long2 lv = cast(long2)v;
            int r = 0;
            if (lv.array[0] < 0) r += 1;
            if (lv.array[1] < 0) r += 2;
            return r;
        }
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
    assert(_mm_movemask_pd(A) == 2);
}

/// Copy the lower 64-bit integer in `v`.
2044 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe 2045 { 2046 long2 lv = cast(long2)v; 2047 return long1(lv.array[0]); 2048 } 2049 unittest 2050 { 2051 __m128i A = _mm_set_epi64x(-1, -2); 2052 __m64 R = _mm_movepi64_pi64(A); 2053 assert(R.array[0] == -2); 2054 } 2055 2056 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element. 2057 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted 2058 { 2059 long2 r; 2060 r.ptr[0] = a.array[0]; 2061 r.ptr[1] = 0; 2062 return cast(__m128i)r; 2063 } 2064 2065 // PERF: unfortunately, __builtin_ia32_pmuludq128 disappeared from LDC 2066 // and is SSE4.1 in GDC 2067 // but seems there in clang 2068 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted 2069 { 2070 __m128i zero = _mm_setzero_si128(); 2071 long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero); 2072 long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero); 2073 static if (__VERSION__ >= 2076) 2074 { 2075 return cast(__m128i)(la * lb); 2076 } 2077 else 2078 { 2079 // long2 mul not supported before LDC 1.5 2080 la.ptr[0] *= lb.array[0]; 2081 la.ptr[1] *= lb.array[1]; 2082 return cast(__m128i)(la); 2083 } 2084 } 2085 unittest 2086 { 2087 __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff); 2088 __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff); 2089 __m128i C = _mm_mul_epu32(A, B); 2090 long2 LC = cast(long2)C; 2091 assert(LC.array[0] == 18446744065119617025uL); 2092 assert(LC.array[1] == 12723420444339690338uL); 2093 } 2094 2095 2096 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe 2097 { 2098 return a * b; 2099 } 2100 unittest 2101 { 2102 __m128d a = [-2.0, 1.5]; 2103 a = _mm_mul_pd(a, a); 2104 assert(a.array == [4.0, 2.25]); 2105 } 2106 2107 version(DigitalMars) 2108 { 2109 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 2110 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe 2111 { 2112 asm pure nothrow @nogc @trusted { nop;} 2113 a.array[0] = a.array[0] * b.array[0]; 2114 
return a; 2115 } 2116 } 2117 else 2118 { 2119 static if (GDC_with_SSE2) 2120 { 2121 alias _mm_mul_sd = __builtin_ia32_mulsd; 2122 } 2123 else 2124 { 2125 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe 2126 { 2127 a.array[0] *= b.array[0]; 2128 return a; 2129 } 2130 } 2131 } 2132 unittest 2133 { 2134 __m128d a = [-2.0, 1.5]; 2135 a = _mm_mul_sd(a, a); 2136 assert(a.array == [4.0, 1.5]); 2137 } 2138 2139 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 2140 /// and get an unsigned 64-bit result. 2141 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe 2142 { 2143 return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b))); 2144 } 2145 unittest 2146 { 2147 __m64 A = _mm_set_pi32(42, 0xDEADBEEF); 2148 __m64 B = _mm_set_pi32(42, 0xCAFEBABE); 2149 __m64 C = _mm_mul_su32(A, B); 2150 assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL); 2151 } 2152 2153 version(LDC) 2154 { 2155 alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128; 2156 } 2157 else 2158 { 2159 static if (GDC_with_SSE2) 2160 { 2161 alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128; 2162 } 2163 else 2164 { 2165 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @safe 2166 { 2167 short8 sa = cast(short8)a; 2168 short8 sb = cast(short8)b; 2169 short8 r = void; 2170 r.array[0] = (sa.array[0] * sb.array[0]) >> 16; 2171 r.array[1] = (sa.array[1] * sb.array[1]) >> 16; 2172 r.array[2] = (sa.array[2] * sb.array[2]) >> 16; 2173 r.array[3] = (sa.array[3] * sb.array[3]) >> 16; 2174 r.array[4] = (sa.array[4] * sb.array[4]) >> 16; 2175 r.array[5] = (sa.array[5] * sb.array[5]) >> 16; 2176 r.array[6] = (sa.array[6] * sb.array[6]) >> 16; 2177 r.array[7] = (sa.array[7] * sb.array[7]) >> 16; 2178 return cast(__m128i)r; 2179 } 2180 } 2181 } 2182 unittest 2183 { 2184 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2185 __m128i B = _mm_set1_epi16(16384); 2186 short8 R = cast(short8)_mm_mulhi_epi16(A, B); 2187 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; 2188 assert(R.array == correct); 2189 } 2190 2191 version(LDC) 
{
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
    }
    else
    {
        /// Multiply packed unsigned 16-bit integers in `a` and `b`, keeping
        /// the high 16 bits of each 32-bit product.
        __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @safe
        {
            short8 wa = cast(short8)a;
            short8 wb = cast(short8)b;
            short8 hi = void;
            // Per lane: widen both operands through ushort, multiply,
            // and keep bits 16..31 of the product.
            foreach(lane; 0..8)
            {
                int product = cast(ushort)wa.array[lane] * cast(ushort)wb.array[lane];
                hi.array[lane] = cast(short)(product >> 16);
            }
            return cast(__m128i)hi;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(A, B);
    short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}

/// Multiply packed 16-bit integers in `a` and `b`, keeping the low 16 bits
/// of each product.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 la = cast(short8)a;
    short8 lb = cast(short8)b;
    return cast(__m128i)(la * lb);
}
unittest
{
    __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mullo_epi16(A, B);
    short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Bitwise OR of two double-precision vectors, done on the raw bits.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    __m128i orBits = cast(__m128i)a | cast(__m128i)b;
    return cast(__m128d)orBits;
}

__m128i
_mm_or_si128 (__m128i a, __m128i b) pure @safe 2248 { 2249 return a | b; 2250 } 2251 2252 version(LDC) 2253 { 2254 alias _mm_packs_epi32 = __builtin_ia32_packssdw128; 2255 } 2256 else 2257 { 2258 static if (GDC_with_SSE2) 2259 { 2260 alias _mm_packs_epi32 = __builtin_ia32_packssdw128; 2261 } 2262 else 2263 { 2264 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @safe 2265 { 2266 short8 r; 2267 r.array[0] = saturateSignedIntToSignedShort(a.array[0]); 2268 r.array[1] = saturateSignedIntToSignedShort(a.array[1]); 2269 r.array[2] = saturateSignedIntToSignedShort(a.array[2]); 2270 r.array[3] = saturateSignedIntToSignedShort(a.array[3]); 2271 r.array[4] = saturateSignedIntToSignedShort(b.array[0]); 2272 r.array[5] = saturateSignedIntToSignedShort(b.array[1]); 2273 r.array[6] = saturateSignedIntToSignedShort(b.array[2]); 2274 r.array[7] = saturateSignedIntToSignedShort(b.array[3]); 2275 return cast(__m128i)r; 2276 } 2277 } 2278 } 2279 unittest 2280 { 2281 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 2282 short8 R = cast(short8) _mm_packs_epi32(A, A); 2283 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; 2284 assert(R.array == correct); 2285 } 2286 2287 version(LDC) 2288 { 2289 alias _mm_packs_epi16 = __builtin_ia32_packsswb128; 2290 } 2291 else 2292 { 2293 static if (GDC_with_SSE2) 2294 { 2295 alias _mm_packs_epi16 = __builtin_ia32_packsswb128; 2296 } 2297 else 2298 { 2299 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @safe 2300 { 2301 byte16 r; 2302 short8 sa = cast(short8)a; 2303 short8 sb = cast(short8)b; 2304 foreach(i; 0..8) 2305 r.array[i] = saturateSignedWordToSignedByte(sa.array[i]); 2306 foreach(i; 0..8) 2307 r.array[i+8] = saturateSignedWordToSignedByte(sb.array[i]); 2308 return cast(__m128i)r; 2309 } 2310 } 2311 } 2312 unittest 2313 { 2314 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); 2315 byte16 R = cast(byte16) _mm_packs_epi16(A, A); 2316 byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0, 
2317 127, -128, 127, 0, 127, -128, 127, 0]; 2318 assert(R.array == correct); 2319 } 2320 2321 version(LDC) 2322 { 2323 alias _mm_packus_epi16 = __builtin_ia32_packuswb128; 2324 } 2325 else 2326 { 2327 static if (GDC_with_SSE2) 2328 { 2329 alias _mm_packus_epi16 = __builtin_ia32_packuswb128; 2330 } 2331 else 2332 { 2333 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2334 { 2335 short8 sa = cast(short8)a; 2336 short8 sb = cast(short8)b; 2337 ubyte[16] result = void; 2338 for (int i = 0; i < 8; ++i) 2339 { 2340 short s = sa[i]; 2341 if (s < 0) s = 0; 2342 if (s > 255) s = 255; 2343 result[i] = cast(ubyte)s; 2344 2345 s = sb[i]; 2346 if (s < 0) s = 0; 2347 if (s > 255) s = 255; 2348 result[i+8] = cast(ubyte)s; 2349 } 2350 return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr); 2351 } 2352 } 2353 } 2354 unittest 2355 { 2356 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 2357 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 2358 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 2359 0, 255, 0, 255, 255, 2, 1, 0]; 2360 foreach(i; 0..16) 2361 assert(AA.array[i] == cast(byte)(correctResult[i])); 2362 } 2363 2364 2365 version(GNU) 2366 { 2367 void _mm_pause() pure @trusted 2368 { 2369 static if (GDC_with_SSE2) 2370 { 2371 __builtin_ia32_pause(); 2372 } 2373 else version(X86) 2374 { 2375 asm pure nothrow @nogc @trusted 2376 { 2377 "pause;\n" : : : ; 2378 } 2379 } 2380 else 2381 static assert(false); 2382 } 2383 } 2384 else version(LDC) 2385 { 2386 alias _mm_pause = __builtin_ia32_pause; 2387 } 2388 else static if (DMD_with_asm) 2389 { 2390 void _mm_pause() pure @safe 2391 { 2392 asm nothrow @nogc pure @safe 2393 { 2394 rep; nop; // F3 90 = pause 2395 } 2396 } 2397 } 2398 else 2399 static assert(false); 2400 unittest 2401 { 2402 _mm_pause(); 2403 } 2404 2405 2406 version(LDC) 2407 { 2408 alias _mm_sad_epu8 = __builtin_ia32_psadbw128; 2409 } 2410 else 2411 { 2412 static if (GDC_with_SSE2) 2413 { 2414 alias 
_mm_sad_epu8 = __builtin_ia32_psadbw128; 2415 } 2416 else 2417 { 2418 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @safe 2419 { 2420 byte16 ab = cast(byte16)a; 2421 byte16 bb = cast(byte16)b; 2422 ubyte[16] t; 2423 foreach(i; 0..16) 2424 { 2425 int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]); 2426 if (diff < 0) diff = -diff; 2427 t[i] = cast(ubyte)(diff); 2428 } 2429 int4 r = _mm_setzero_si128(); 2430 r.array[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 2431 r.array[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; 2432 return r; 2433 } 2434 } 2435 } 2436 unittest 2437 { 2438 __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 2439 __m128i B = _mm_set1_epi8(1); 2440 __m128i R = _mm_sad_epu8(A, B); 2441 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 2442 0, 2443 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 2444 0]; 2445 assert(R.array == correct); 2446 } 2447 2448 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 2449 { 2450 short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7]; 2451 return cast(__m128i) loadUnaligned!(short8)(result.ptr); 2452 } 2453 unittest 2454 { 2455 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 2456 short8 B = cast(short8) A; 2457 foreach(i; 0..8) 2458 assert(B.array[i] == i); 2459 } 2460 2461 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 2462 { 2463 int[4] result = [e0, e1, e2, e3]; 2464 return loadUnaligned!(int4)(result.ptr); 2465 } 2466 unittest 2467 { 2468 __m128i A = _mm_set_epi32(3, 2, 1, 0); 2469 foreach(i; 0..4) 2470 assert(A.array[i] == i); 2471 } 2472 2473 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted 2474 { 2475 long[2] result = [e0.array[0], e1.array[0]]; 2476 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 2477 } 2478 unittest 2479 { 2480 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); 2481 long2 B = 
cast(long2) A; // NOTE(review): tail of a unittest whose opening lies before this chunk
assert(B.array[0] == 5678);
assert(B.array[1] == 1234);
}

/// Set packed 64-bit integers with the supplied values (`e0` becomes the lower lane).
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    long[2] result = [e0, e1];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64x(1234, 5678);
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

/// Set packed 8-bit integers with the supplied values (`e0` becomes the lowest lane).
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7,
                       e8, e9, e10, e11, e12, e13, e14, e15];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Set packed double-precision (64-bit) floating-point elements (`e0` is the lower element).
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e0, e1];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_pd(61.0, 55.0);
    double[2] correct = [55.0, 61.0];
    assert(A.array == correct);
}

/// Broadcast double-precision value `a` to both elements.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    double[2] result = [a, a];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_pd1(61.0);
    double[2] correct = [61.0, 61.0];
    assert(A.array == correct);
}

/// Copy `a` to the lower element and zero the upper element.
__m128d _mm_set_sd (double a) pure @trusted
{
    double[2] result = [a, 0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Broadcast 16-bit integer `a` to all elements.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    return cast(__m128i)(short8(a));
}

/// Broadcast 32-bit integer `a` to all elements.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    return cast(__m128i)(int4(a));
}
unittest
{
    __m128 a = _mm_set1_ps(-1.0f);
    __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff);
    assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]);
}

/// Broadcast 64-bit integer `a` to all elements of `dst`.
__m128i _mm_set1_epi64 (__m64 a) pure @safe
{
    return _mm_set_epi64(a, a);
}

/// Broadcast 64-bit integer `a` to all elements.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    return cast(__m128i)(long2(a));
}

/// Broadcast 8-bit integer `a` to all elements.
__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    return cast(__m128i)(byte16(a));
}

alias _mm_set1_pd = _mm_set_pd1;

/// Set packed 16-bit integers with the supplied values in reverse order
/// (`e7` becomes the lowest lane).
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4,
                        short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}

/// Set packed 64-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] result = [e1, e0];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9, byte e8,
                       byte e7, byte e6, byte e5, byte e4,
                       byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
                       e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Set packed doubles in reverse order (`e1` becomes the lower element).
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e1, e0];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_setr_pd(61.0, 55.0);
    double[2] correct = [61.0, 55.0];
    assert(A.array == correct);
}

/// Return a vector with both double elements zeroed.
__m128d _mm_setzero_pd () pure @trusted
{
    double[2] result = [0.0, 0.0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Return a 128-bit integer vector with all bits zeroed.
__m128i _mm_setzero_si128() pure @trusted
{
    int[4] result = [0, 0, 0, 0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}

/// Shuffle 32-bit lanes of `a` using the compile-time control `imm8` (PSHUFD).
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufd(a, imm8);
    }
    else
    {
        // Each 2-bit field of imm8 selects the source lane for one destination lane.
        return shufflevector!(int4, (imm8 >> 0) & 3,
                                    (imm8 >> 2) & 3,
                                    (imm8 >> 4) & 3,
                                    (imm8 >> 6) & 3)(a, a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

/// Shuffle doubles: lower element picked from `a`, upper element from `b`, per `imm8` (SHUFPD).
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_shufpd(a, b, imm8);
    }
    else
    {
        // Indices 0-1 address `a`, 2-3 address `b` in the concatenated shufflevector input.
        return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                       2 + ( (imm8 >> 1) & 1 ))(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}

/// Shuffle the upper four 16-bit lanes of `a` per `imm8`; the lower four pass through (PSHUFHW).
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufhw(a, imm8);
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                            4 + ( (imm8 >> 0) & 3 ),
                                            4 + ( (imm8 >> 2) & 3 ),
                                            4 + ( (imm8 >> 4) & 3 ),
                                            4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}

/// Shuffle the lower four 16-bit lanes of `a` per `imm8`; the upper four pass through (PSHUFLW).
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshuflw(a, imm8);
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                    ( (imm8 >> 2) & 3 ),
                                                    ( (imm8 >> 4) & 3 ),
                                                    ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}

// _mm_sll_epi32: shift 32-bit lanes of `a` left by the count held in the low 64 bits of `count`.
version(LDC)
{
    alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else static if (GDC_with_SSE2)
{
    alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else static if (DMD_with_32bit_asm)
{
    __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            pslld XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << bits;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_sll_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}

// _mm_sll_epi64: shift 64-bit lanes of `a` left by the count held in the low 64 bits of `count`.
version(LDC)
{
    alias _mm_sll_epi64 = __builtin_ia32_psllq128;
}
else static if (GDC_with_SSE2)
{
    alias _mm_sll_epi64 = __builtin_ia32_psllq128;
}
else static if (DMD_with_32bit_asm)
{
    __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllq XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..2)
            r.array[i] = cast(ulong)(sa.array[i]) << bits;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_sll_epi64(A, _mm_cvtsi32_si128(1));
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}

// _mm_sll_epi16: shift 16-bit lanes of `a` left by the count held in the low 64 bits of `count`.
version(LDC)
{
    alias _mm_sll_epi16 = __builtin_ia32_psllw128;
}
else static if (GDC_with_SSE2)
{
    alias _mm_sll_epi16 = __builtin_ia32_psllw128;
}
else static if (DMD_with_32bit_asm)
{
    __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
    {
        asm pure nothrow @nogc
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_sll_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}

// _mm_slli_epi32: shift 32-bit lanes of `a` left by immediate `imm8`.
version(LDC)
{
    alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
    }
    else
    {
        __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe
        {
            int4 r = void;
            foreach(i; 0..4)
                r.array[i] = cast(uint)(a.array[i]) << imm8;
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}

// _mm_slli_epi64: shift 64-bit lanes of `a` left by immediate `imm8`.
version(LDC)
{
    alias _mm_slli_epi64 = __builtin_ia32_psllqi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_slli_epi64 = __builtin_ia32_psllqi128;
    }
    else
    {
        __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe
        {
            long2 r = void;
            long2 sa = cast(long2)a;
            foreach(i; 0..2)
                r.array[i] = cast(ulong)(sa.array[i]) << imm8;
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_slli_epi64(A, 1);
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}

// _mm_slli_epi16: shift 16-bit lanes of `a` left by immediate `imm8`.
version(LDC)
{
    alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
    }
    else
    {
        __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(cast(ushort)(sa.array[i]) << imm8);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}


/// Shift `a` left by `bytes` bytes while shifting in zeros.
__m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
{
    static if (bytes & 0xF0)
    {
        // Shifting by 16 or more bytes always yields zero.
        return _mm_setzero_si128();
    }
    else
    {
        static if (GDC_with_SSE2)
        {
            // Fixed: previously called the non-existent `__builtin_ia32_i128`.
            // The GCC builtin for PSLLDQ is `__builtin_ia32_pslldqi128` and takes the
            // shift amount in *bits* (hence `bytes * 8`), mirroring the
            // `__builtin_ia32_psrldqi128` call used by `_mm_srli_si128`.
            return cast(__m128i) __builtin_ia32_pslldqi128(op, cast(ubyte)(bytes * 8));
        }
        else version(DigitalMars)
        {
            version(D_InlineAsm_X86)
            {
                asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
                {
                    movdqu XMM0, op;
                    pslldq XMM0, bytes;
                    movdqu op, XMM0;
                }
                return op;
            }
            else
            {
                // Scalar fallback: move each byte up by `bytes`, zero-fill the bottom.
                byte16 A = cast(byte16)op;
                byte16 R;
                for (int n = 15; n >= bytes; --n)
                    R.ptr[n] = A.array[n-bytes];
                for (int n = bytes-1; n >= 0; --n)
                    R.ptr[n] = 0;
                return cast(__m128i)R;
            }
        }
        else
        {
            // Concatenate zeroes (indices 0..15) with `op` (indices 16..31) and slide.
            return cast(__m128i) shufflevector!(byte16,
                16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
                22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
                28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
                (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
    short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(R.array == correct);
}

// _mm_sqrt_pd: square root of both double elements.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    else
    {
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    }
    else
    {
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = sqrt(vec.array[0]);
            vec.array[1] = sqrt(vec.array[1]);
            return vec;
        }
    }
}


// _mm_sqrt_sd: square root of the lower double; upper element passes through.
// NOTE(review): Intel's _mm_sqrt_sd takes two operands; this variant takes one — confirm intent.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    else
    {
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            return vec;
        }
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    }
    else
    {
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            return vec;
        }
    }
}


// _mm_sra_epi16: arithmetic right shift of 16-bit lanes by the count in `count`'s low 64 bits.
version(LDC)
{
    alias _mm_sra_epi16 = __builtin_ia32_psraw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sra_epi16 = __builtin_ia32_psraw128;
    }
    else
    {
        __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe
        {
            short8 sa = cast(short8)a;
            long2 lc = cast(long2)count;
            int bits = cast(int)(lc.array[0]);
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(sa.array[i] >> bits);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_sra_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
}

// _mm_sra_epi32: arithmetic right shift of 32-bit lanes by the count in `count`'s low 64 bits.
version(LDC)
{
    alias _mm_sra_epi32 = __builtin_ia32_psrad128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sra_epi32 = __builtin_ia32_psrad128;
    }
    else
    {
        __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @safe
        {
            int4 r = void;
            long2 lc = cast(long2)count;
            int bits = cast(int)(lc.array[0]);
            foreach(i; 0..4)
                r.array[i] = (a.array[i] >> bits);
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_sra_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
}


// _mm_srai_epi16: arithmetic right shift of 16-bit lanes by immediate `imm8`.
version(LDC)
{
    alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
    }
    else
    {
        __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(sa.array[i] >> imm8);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
}

// _mm_srai_epi32: arithmetic right shift of 32-bit lanes by immediate `imm8`.
version(LDC)
{
    alias _mm_srai_epi32 = __builtin_ia32_psradi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srai_epi32 = __builtin_ia32_psradi128;
    }
    else
    {
        __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe
        {
            int4 r = void;
            foreach(i; 0..4)
                r.array[i] = (a.array[i] >> imm8);
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srai_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
}

// _mm_srl_epi16: logical right shift of 16-bit lanes by the count in `count`'s low 64 bits.
version(LDC)
{
    alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
    }
    else
    {
        __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe
        {
            short8 sa = cast(short8)a;
            long2 lc = cast(long2)count;
            int bits = cast(int)(lc.array[0]);
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srl_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
}

// _mm_srl_epi32: logical right shift of 32-bit lanes by the count in `count`'s low 64 bits.
version(LDC)
{
    alias _mm_srl_epi32 = __builtin_ia32_psrld128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srl_epi32 = __builtin_ia32_psrld128;
    }
    else
    {
        __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @safe
        {
            int4 r = void;
            long2 lc = cast(long2)count;
            int bits = cast(int)(lc.array[0]);
            foreach(i; 0..4)
                r.array[i] = cast(uint)(a.array[i]) >> bits;
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srl_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
}

// _mm_srl_epi64: logical right shift of 64-bit lanes by the count in `count`'s low 64 bits.
version(LDC)
{
    alias _mm_srl_epi64 = __builtin_ia32_psrlq128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srl_epi64 = __builtin_ia32_psrlq128;
    }
    else
    {
        __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe
        {
            long2 r = void;
            long2 sa = cast(long2)a;
            long2 lc = cast(long2)count;
            int bits = cast(int)(lc.array[0]);
            foreach(i; 0..2)
                r.array[i] = cast(ulong)(sa.array[i]) >> bits;
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srl_epi64(A, _mm_cvtsi32_si128(1));
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}

// _mm_srli_epi16: logical right shift of 16-bit lanes by immediate `imm8`.
version(LDC)
{
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
    }
    else
    {
        __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> imm8);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
}

// _mm_srli_epi32: logical right shift of 32-bit lanes by immediate `imm8`.
version(LDC)
{
    alias _mm_srli_epi32 = __builtin_ia32_psrldi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi32 = __builtin_ia32_psrldi128;
    }
    else
    {
        __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @safe
        {
            int4 r = void;
            foreach(i; 0..4)
                r.array[i] = cast(uint)(a.array[i]) >> imm8;
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srli_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
}

// _mm_srli_epi64: logical right shift of 64-bit lanes by immediate `imm8`.
version(LDC)
{
    alias _mm_srli_epi64 = __builtin_ia32_psrlqi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi64 = __builtin_ia32_psrlqi128;
    }
    else
    {
        __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @safe
        {
            long2 r = void;
            long2 sa = cast(long2)a;
            foreach(i; 0..2)
                r.array[i] = cast(ulong)(sa.array[i]) >> imm8;
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srli_epi64(A, 1);
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    static if (bytes & 0xF0)
    {
        // Shifting by 16 or more bytes always yields zero.
        return _mm_setzero_si128();
    }
    else
    {
        static if (GDC_with_SSE2)
        {
            // GCC builtin takes the shift amount in bits.
            return cast(__m128i) __builtin_ia32_psrldqi128(v, cast(ubyte)(bytes * 8));
        }
        else static if (DMD_with_32bit_asm)
        {
            asm pure nothrow @nogc @trusted
            {
                movdqu XMM0, v;
                psrldq XMM0, bytes;
                movdqu v, XMM0;
            }
            return v;
        }
        else
        {
            // Concatenate `v` (indices 0..15) with zeroes (indices 16..31) and slide down.
            return cast(__m128i) shufflevector!(byte16,
                bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
                bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
                (cast(byte16) v, cast(byte16)_mm_setzero_si128());
        }
    }

}

unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] correct = [2, 3, 4, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
}
unittest
{
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
}

/// Store both doubles of `a` to memory. `mem_addr` is assumed 16-byte aligned.
void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}

/// Store the lower double of `a` to both slots of a 16-byte-aligned destination.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    __m128d r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[0];
    *aligned = r;
}

/// Store the lower double of `a` to memory.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[0];
}

/// Store 128 bits of integer data to memory.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1;

/// Store the upper double of `a` to memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[1];
}

// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exist in C++.
/// Store the lower 64-bit integer of `a` to memory.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long* dest = cast(long*)mem_addr;
    long2 la = cast(long2)a;
    *dest = la.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store the lower double of `a` to memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[0];
}

/// Store the two doubles of `a` to memory in reverse order.
/// `mem_addr` is assumed 16-byte aligned.
/// Now `@trusted` for consistency with the sibling stores (`_mm_store_pd`,
/// `_mm_store_pd1`) that perform the same pointer reinterpretation.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}

/// Store both doubles of `a` to unaligned memory.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!double2(a, mem_addr);
}

/// Store 128 bits of integer data to unaligned memory.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
void _mm_stream_pd (double* mem_addr, __m128d a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a;
}

/// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    __m128i* dest = cast(__m128i*)mem_addr;
    *dest = a;
}

/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address mem_addr is already in the cache,
/// the cache will be updated.
void _mm_stream_si32 (int* mem_addr, int a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Store 64-bit integer a into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address mem_addr is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    return a - b;
}

// Subtract the lower double-precision element in `b` from the lower element
// in `a`; the upper element of the result is copied from `a`.
version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
}
else static if (GDC_with_SSE2)
{
    alias _mm_sub_sd = __builtin_ia32_subsd;
}
else
{
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        a.array[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}

/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    return a - b;
}

// Subtract packed signed 16-bit integers in `b` from `a` using saturation.
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBSW since LDC 1.15 -O0
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
    }
    else
    {
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Subtract packed signed 8-bit integers in `b` from `a` using saturation.
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBSB since LDC 1.15 -O0
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
    }
    else
    {
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Subtract packed unsigned 16-bit integers in `b` from `a` using saturation.
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBUSW since LDC 1.15 -O0
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
    }
    else
    {
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                // Reinterpret lanes as unsigned, subtract in `int` so the
                // difference can go negative before clamping to [0, 65535].
                int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(sum);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}

// Subtract packed unsigned 8-bit integers in `b` from `a` using saturation.
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBUSB since LDC 1.15 -O0
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
    }
    else
    {
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Note: the only difference between these intrinsics is the signalling
// behaviour of quiet NaNs. This is incorrect but the case where
// you would want to differentiate between qNaN and sNaN and then
// treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd;
alias _mm_ucomige_sd = _mm_comige_sd;
alias _mm_ucomigt_sd = _mm_comigt_sd;
alias _mm_ucomile_sd = _mm_comile_sd;
alias _mm_ucomilt_sd = _mm_comilt_sd;
alias _mm_ucomineq_sd = _mm_comineq_sd;

/// Return a vector of type `__m128d` with indeterminate elements.
__m128d _mm_undefined_pd() pure @safe
{
    __m128d result = void;
    return result;
}

/// Return a vector of type `__m128i` with indeterminate elements.
__m128i _mm_undefined_si128() pure @safe
{
    __m128i result = void;
    return result;
}

/// Interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhwd128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}

/// Interleave 32-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhdq128(a, b);
    }
    else
    {
        return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
    }
}

/// Interleave 64-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhqdq128(a, b);
    }
    else
    {
        // Lanes are 32-bit here: lanes 2..3 of `a` are its high 64 bits,
        // lanes 2..3 of `b` (kept from the copy) are its high 64 bits.
        __m128i r = cast(__m128i)b;
        r[0] = a[2];
        r[1] = a[3];
        return r;
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
    long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
    assert(C.array == correct);
}

/// Interleave 8-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhbw128(a, b);
    }
    else
    {
        return cast(__m128i)shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27,
                                           12, 28, 13, 29, 14, 30, 15, 31)
                                          (cast(byte16)a, cast(byte16)b);
    }
}

/// Interleave double-precision elements from the high half of `a` and `b`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpckhpd(a, b);
    }
    else
    {
        return shufflevector!(__m128d, 1, 3)(a, b);
    }
}

/// Interleave 16-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpcklwd128(a, b);
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                           (cast(short8)a, cast(short8)b);
    }
}

/// Interleave 32-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckldq128(a, b);
    }
    else
    {
        return shufflevector!(int4, 0, 4, 1, 5)
                             (cast(int4)a, cast(int4)b);
    }
}

/// Interleave 64-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpcklqdq128(a, b);
    }
    else
    {
        long2 lA = cast(long2)a;
        long2 lB = cast(long2)b;
        long2 R;
        R.ptr[0] = lA.array[0];
        R.ptr[1] = lB.array[0];
        return cast(__m128i)R;
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
    long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
    assert(C.array == correct);
}


/// Interleave 8-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpcklbw128(a, b);
    }
    else
    {
        return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                            4, 20, 5, 21, 6, 22, 7, 23)
                                           (cast(byte16)a, cast(byte16)b);
    }
}

/// Interleave double-precision elements from the low half of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpcklpd(a, b);
    }
    else
    {
        return shufflevector!(__m128d, 0, 2)(a, b);
    }
}

/// Compute the bitwise XOR of packed double-precision elements in `a` and `b`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
}

/// Compute the bitwise XOR of 128 bits in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}

unittest
{
    // distance between two points in 4D
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}