1 /** 2 * Copyright: Copyright Auburn Sounds 2016-2019. 3 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 4 * Authors: Guillaume Piolat 5 */ 6 module inteli.emmintrin; 7 8 public import inteli.types; 9 public import inteli.xmmintrin; // SSE2 includes SSE1 10 import inteli.mmx; 11 import inteli.internals; 12 13 nothrow @nogc: 14 15 // SSE2 instructions 16 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2 17 18 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe 19 { 20 return cast(__m128i)(cast(short8)a + cast(short8)b); 21 } 22 23 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe 24 { 25 return cast(__m128i)(cast(int4)a + cast(int4)b); 26 } 27 28 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe 29 { 30 return cast(__m128i)(cast(long2)a + cast(long2)b); 31 } 32 33 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe 34 { 35 return cast(__m128i)(cast(byte16)a + cast(byte16)b); 36 } 37 38 version(DigitalMars) 39 { 40 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 41 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe 42 { 43 pragma(inline, false); 44 a[0] = a[0] + b[0]; 45 return a; 46 } 47 } 48 else 49 { 50 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe 51 { 52 a[0] += b[0]; 53 return a; 54 } 55 } 56 unittest 57 { 58 __m128d a = [1.5, -2.0]; 59 a = _mm_add_sd(a, a); 60 assert(a.array == [3.0, -2.0]); 61 } 62 63 64 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe 65 { 66 return a + b; 67 } 68 unittest 69 { 70 __m128d a = [1.5, -2.0]; 71 a = _mm_add_pd(a, a); 72 assert(a.array == [3.0, -4.0]); 73 } 74 75 // TODO: _mm_add_si64 76 77 version(LDC) 78 { 79 alias _mm_adds_epi16 = __builtin_ia32_paddsw128; 80 } 81 else 82 { 83 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted 84 { 85 short[8] res; 86 short8 sa = cast(short8)a; 87 short8 sb = cast(short8)b; 88 foreach(i; 0..8) 89 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 90 return 
_mm_loadu_si128(cast(int4*)res.ptr); 91 } 92 } 93 unittest 94 { 95 short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), 96 _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0)); 97 static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14]; 98 assert(res.array == correctResult); 99 } 100 101 version(LDC) 102 { 103 alias _mm_adds_epi8 = __builtin_ia32_paddsb128; 104 } 105 else 106 { 107 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted 108 { 109 byte[16] res; 110 byte16 sa = cast(byte16)a; 111 byte16 sb = cast(byte16)b; 112 foreach(i; 0..16) 113 res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]); 114 return _mm_loadu_si128(cast(int4*)res.ptr); 115 } 116 } 117 unittest 118 { 119 byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 120 _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 121 static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14, 122 16, 18, 20, 22, 24, 26, 28, 30]; 123 assert(res.array == correctResult); 124 } 125 126 version(LDC) 127 { 128 alias _mm_adds_epu8 = __builtin_ia32_paddusb128; 129 } 130 else 131 { 132 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted 133 { 134 ubyte[16] res; 135 byte16 sa = cast(byte16)a; 136 byte16 sb = cast(byte16)b; 137 foreach(i; 0..16) 138 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 139 return _mm_loadu_si128(cast(int4*)res.ptr); 140 } 141 } 142 143 version(LDC) 144 { 145 alias _mm_adds_epu16 = __builtin_ia32_paddusw128; 146 } 147 else 148 { 149 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted 150 { 151 ushort[8] res; 152 short8 sa = cast(short8)a; 153 short8 sb = cast(short8)b; 154 foreach(i; 0..8) 155 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 156 return _mm_loadu_si128(cast(int4*)res.ptr); 157 } 158 } 159 160 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe 
161 { 162 return cast(__m128d)( cast(__m128i)a & cast(__m128i)b ); 163 } 164 165 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe 166 { 167 return a & b; 168 } 169 unittest 170 { 171 __m128i A = _mm_set1_epi32(7); 172 __m128i B = _mm_set1_epi32(14); 173 __m128i R = _mm_and_si128(A, B); 174 int[4] correct = [6, 6, 6, 6]; 175 assert(R.array == correct); 176 } 177 178 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe 179 { 180 return cast(__m128d)( (~cast(__m128i)a) & cast(__m128i)b ); 181 } 182 183 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe 184 { 185 return (~a) & b; 186 } 187 unittest 188 { 189 __m128i A = _mm_set1_epi32(7); 190 __m128i B = _mm_set1_epi32(14); 191 __m128i R = _mm_andnot_si128(A, B); 192 int[4] correct = [8, 8, 8, 8]; 193 assert(R.array == correct); 194 } 195 196 version(LDC) 197 { 198 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @safe 199 { 200 // Generates pavgw even in LDC 1.0, even in -O0 201 enum ir = ` 202 %ia = zext <8 x i16> %0 to <8 x i32> 203 %ib = zext <8 x i16> %1 to <8 x i32> 204 %isum = add <8 x i32> %ia, %ib 205 %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 206 %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 207 %r = trunc <8 x i32> %isums to <8 x i16> 208 ret <8 x i16> %r`; 209 return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b); 210 } 211 } 212 else 213 { 214 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @safe 215 { 216 short8 sa = cast(short8)a; 217 short8 sb = cast(short8)b; 218 short8 sr = void; 219 foreach(i; 0..8) 220 { 221 sr[i] = cast(ushort)( (cast(ushort)(sa[i]) + cast(ushort)(sb[i]) + 1) >> 1 ); 222 } 223 return cast(int4)sr; 224 } 225 } 226 unittest 227 { 228 __m128i A = _mm_set1_epi16(31); 229 __m128i B = _mm_set1_epi16(64); 230 short8 avg = cast(short8)(_mm_avg_epu16(A, B)); 231 foreach(i; 0..8) 232 assert(avg[i] == 48); 233 } 234 235 version(LDC) 236 { 237 
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @safe 238 { 239 // Generates pavgb even in LDC 1.0, even in -O0 240 enum ir = ` 241 %ia = zext <16 x i8> %0 to <16 x i16> 242 %ib = zext <16 x i8> %1 to <16 x i16> 243 %isum = add <16 x i16> %ia, %ib 244 %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 245 %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 246 %r = trunc <16 x i16> %isums to <16 x i8> 247 ret <16 x i8> %r`; 248 return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 249 } 250 } 251 else 252 { 253 __m128i _mm_avg_epu8 (__m128i a, __m128i b) 254 { 255 byte16 sa = cast(byte16)a; 256 byte16 sb = cast(byte16)b; 257 byte16 sr = void; 258 foreach(i; 0..16) 259 { 260 sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 ); 261 } 262 return cast(int4)sr; 263 } 264 } 265 unittest 266 { 267 __m128i A = _mm_set1_epi8(31); 268 __m128i B = _mm_set1_epi8(64); 269 byte16 avg = cast(byte16)(_mm_avg_epu8(A, B)); 270 foreach(i; 0..16) 271 assert(avg[i] == 48); 272 } 273 274 // Note: unlike Intel API, shift amount is a compile-time parameter. 
/// Shift `a` left by `bits` bytes while shifting in zeros.
/// Note: unlike the Intel API, the shift amount is a compile-time parameter;
/// `bits` must be in 0..16 or the shufflevector indices fall out of range.
__m128i _mm_bslli_si128(int bits)(__m128i a) pure @safe
{
    // Generates pslldq starting with LDC 1.1 -O2.
    // Concatenates (zero, a) as 32 bytes and picks a 16-byte window shifted
    // so that `bits` zero bytes enter from the low end.
    __m128i zero = _mm_setzero_si128();
    return cast(__m128i)
        shufflevector!(byte16, 16 - bits, 17 - bits, 18 - bits, 19 - bits,
                               20 - bits, 21 - bits, 22 - bits, 23 - bits,
                               24 - bits, 25 - bits, 26 - bits, 27 - bits,
                               28 - bits, 29 - bits, 30 - bits, 31 - bits)
                      (cast(byte16)zero, cast(byte16)a);
}
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Shift `a` right by `bits` bytes while shifting in zeros.
// Note: unlike Intel API, shift amount is a compile-time parameter.
__m128i _mm_bsrli_si128(int bits)(__m128i a) pure @safe
{
    // Generates psrldq starting with LDC 1.1 -O2.
    // Mirror of _mm_bslli_si128: window over (a, zero) shifted upward.
    __m128i zero = _mm_setzero_si128();
    return cast(__m128i)
        shufflevector!(byte16, 0 + bits, 1 + bits, 2 + bits, 3 + bits,
                               4 + bits, 5 + bits, 6 + bits, 7 + bits,
                               8 + bits, 9 + bits, 10 + bits, 11 + bits,
                               12 + bits, 13 + bits, 14 + bits, 15 + bits)
                      (cast(byte16)a, cast(byte16)zero);
}
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Reinterpret the bits of `a` as packed single-precision floats (no conversion).
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Reinterpret the bits of `a` as a 128-bit integer vector (no conversion).
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Reinterpret the bits of `a` as packed double-precision floats (no conversion).
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Reinterpret the bits of `a` as a 128-bit integer vector (no conversion).
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Reinterpret the bits of `a` as packed double-precision floats (no conversion).
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Reinterpret the bits of `a` as packed single-precision floats (no conversion).
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}

version(LDC)
{
    alias _mm_clflush = __builtin_ia32_clflush;
}
else
{
    /// Invalidate and flush the cache line containing `p` from all cache levels.
    void _mm_clflush (const(void)* p) pure @safe
    {
        version(D_InlineAsm_X86)
        {
            asm pure nothrow @nogc @safe
            {
                mov EAX, p;
                clflush [EAX];
            }
        }
        else version(D_InlineAsm_X86_64)
        {
            asm pure nothrow @nogc @safe
            {
                mov RAX, p;
                clflush [RAX];
            }
        }
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}


/// Compare packed 16-bit integers for equality; each lane is all-ones on equality, zero otherwise.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0];
    short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers for equality; each lane is all-ones on equality, zero otherwise.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    return equalMask!__m128i(a, b);
}
unittest
{
    int4 A = [-3, -2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, -1];
    // Bug fix: this test previously called _mm_cmpeq_epi16, so the epi32
    // intrinsic was never actually exercised.
    int4 R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers for equality; each lane is all-ones on equality, zero otherwise.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}

/// Compare packed doubles for equality (ordered: NaN compares false).
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
}

/// Compare the lowest doubles of `a` and `b` for equality; upper lane is passed through from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b); 424 } 425 426 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe 427 { 428 return cast(__m128d) cmppd!(FPComparison.oge)(a, b); 429 } 430 431 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe 432 { 433 return cast(__m128d) cmpsd!(FPComparison.oge)(a, b); 434 } 435 436 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe 437 { 438 return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b)); 439 } 440 unittest 441 { 442 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 443 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 444 short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1]; 445 short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B)); 446 assert(R.array == E); 447 } 448 449 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe 450 { 451 return cast(__m128i)( greaterMask!int4(a, b)); 452 } 453 unittest 454 { 455 int4 A = [-3, 2, -1, 0]; 456 int4 B = [ 4, -2, 2, 0]; 457 int[4] E = [ 0, -1, 0, 0]; 458 int4 R = cast(int4)(_mm_cmpgt_epi32(A, B)); 459 assert(R.array == E); 460 } 461 462 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe 463 { 464 return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b)); 465 } 466 unittest 467 { 468 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 469 __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 470 byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B); 471 byte[16] correct = [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0]; 472 __m128i D = _mm_cmpeq_epi8(A, B); 473 assert(C.array == correct); 474 } 475 476 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe 477 { 478 return cast(__m128d) cmppd!(FPComparison.ogt)(a, b); 479 } 480 481 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe 482 { 483 return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b); 484 } 485 486 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe 487 { 488 return cast(__m128d) cmppd!(FPComparison.ole)(a, b); 489 } 490 491 
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe 492 { 493 return cast(__m128d) cmpsd!(FPComparison.ole)(a, b); 494 } 495 496 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe 497 { 498 return _mm_cmpgt_epi16(b, a); 499 } 500 501 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe 502 { 503 return _mm_cmpgt_epi32(b, a); 504 } 505 506 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe 507 { 508 return _mm_cmpgt_epi8(b, a); 509 } 510 511 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe 512 { 513 return cast(__m128d) cmppd!(FPComparison.olt)(a, b); 514 } 515 516 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe 517 { 518 return cast(__m128d) cmpsd!(FPComparison.olt)(a, b); 519 } 520 521 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe 522 { 523 return cast(__m128d) cmppd!(FPComparison.une)(a, b); 524 } 525 526 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe 527 { 528 return cast(__m128d) cmpsd!(FPComparison.une)(a, b); 529 } 530 531 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe 532 { 533 return cast(__m128d) cmppd!(FPComparison.ult)(a, b); 534 } 535 536 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe 537 { 538 return cast(__m128d) cmpsd!(FPComparison.ult)(a, b); 539 } 540 541 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe 542 { 543 return cast(__m128d) cmppd!(FPComparison.ule)(a, b); 544 } 545 546 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe 547 { 548 return cast(__m128d) cmpsd!(FPComparison.ule)(a, b); 549 } 550 551 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe 552 { 553 return cast(__m128d) cmppd!(FPComparison.ugt)(a, b); 554 } 555 556 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe 557 { 558 return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b); 559 } 560 561 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe 562 { 563 return cast(__m128d) cmppd!(FPComparison.uge)(a, b); 564 } 565 566 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe 567 { 568 return 
cast(__m128d) cmpsd!(FPComparison.uge)(a, b); 569 } 570 571 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe 572 { 573 return cast(__m128d) cmppd!(FPComparison.ord)(a, b); 574 } 575 576 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe 577 { 578 return cast(__m128d) cmpsd!(FPComparison.ord)(a, b); 579 } 580 581 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe 582 { 583 return cast(__m128d) cmppd!(FPComparison.uno)(a, b); 584 } 585 586 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe 587 { 588 return cast(__m128d) cmpsd!(FPComparison.uno)(a, b); 589 } 590 591 592 // Note: we've reverted clang and GCC behaviour with regards to EFLAGS 593 // Some such comparisons yields true for NaNs, other don't. 594 595 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe 596 { 597 return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC 598 } 599 600 int _mm_comige_sd (__m128d a, __m128d b) pure @safe 601 { 602 return comsd!(FPComparison.oge)(a, b); 603 } 604 605 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe 606 { 607 return comsd!(FPComparison.ogt)(a, b); 608 } 609 610 int _mm_comile_sd (__m128d a, __m128d b) pure @safe 611 { 612 return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC 613 } 614 615 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe 616 { 617 return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC 618 } 619 620 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe 621 { 622 return comsd!(FPComparison.one)(a, b); 623 } 624 625 version(LDC) 626 { 627 __m128d _mm_cvtepi32_pd (__m128i a) pure @safe 628 { 629 // Generates cvtdq2pd since LDC 1.0, even without optimizations 630 enum ir = ` 631 %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1> 632 %r = sitofp <2 x i32> %v to <2 x double> 633 ret <2 x double> %r`; 634 return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a); 635 } 636 } 637 else 638 { 639 __m128d _mm_cvtepi32_pd (__m128i a) pure @safe 640 { 641 
double2 r = void; 642 r[0] = a[0]; 643 r[1] = a[1]; 644 return r; 645 } 646 } 647 unittest 648 { 649 __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54)); 650 assert(A[0] == 54.0); 651 assert(A[1] == 54.0); 652 } 653 654 __m128 _mm_cvtepi32_ps(__m128i a) pure @safe 655 { 656 // Generates cvtdq2ps since LDC 1.0.0 -O1 657 __m128 res; 658 res.array[0] = cast(float)a.array[0]; 659 res.array[1] = cast(float)a.array[1]; 660 res.array[2] = cast(float)a.array[2]; 661 res.array[3] = cast(float)a.array[3]; 662 return res; 663 } 664 unittest 665 { 666 __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000)); 667 assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]); 668 } 669 670 671 version(LDC) 672 { 673 // Like in clang, implemented with a magic intrinsic right now 674 alias _mm_cvtpd_epi32 = __builtin_ia32_cvtpd2dq; 675 676 /* Unfortunately this generates a cvttpd2dq instruction 677 __m128i _mm_cvtpd_epi32 (__m128d a) pure @safe 678 { 679 enum ir = ` 680 %i = fptosi <2 x double> %0 to <2 x i32> 681 %r = shufflevector <2 x i32> %i,<2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 682 ret <4 x i32> %r`; 683 684 return cast(__m128i) inlineIR!(ir, __m128i, __m128d)(a); 685 } */ 686 } 687 else 688 { 689 __m128i _mm_cvtpd_epi32 (__m128d a) pure @safe 690 { 691 __m128i r = _mm_setzero_si128(); 692 r[0] = convertDoubleToInt32UsingMXCSR(a[0]); 693 r[1] = convertDoubleToInt32UsingMXCSR(a[1]); 694 return r; 695 } 696 } 697 unittest 698 { 699 int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0)); 700 assert(A[0] == 55 && A[1] == 61 && A[2] == 0 && A[3] == 0); 701 } 702 703 // TODO: _mm_cvtpd_pi32 704 705 version(LDC) 706 { 707 alias _mm_cvtpd_ps = __builtin_ia32_cvtpd2ps; // can't be done with IR unfortunately 708 } 709 else 710 { 711 __m128 _mm_cvtpd_ps (__m128d a) pure @safe 712 { 713 __m128 r = void; 714 r[0] = a[0]; 715 r[1] = a[1]; 716 r[2] = 0; 717 r[3] = 0; 718 return r; 719 } 720 } 721 unittest 722 { 723 __m128d A = _mm_set_pd(5.25, 4.0); 724 __m128 B = _mm_cvtpd_ps(A); 725 
assert(B.array == [4.0f, 5.25f, 0, 0]); 726 } 727 728 // TODO: _mm_cvtpi32_pd 729 730 version(LDC) 731 { 732 // Disabled, since it fail with optimizations unfortunately 733 //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq; 734 735 __m128i _mm_cvtps_epi32 (__m128 a) pure @trusted 736 { 737 return __asm!__m128i("cvtps2dq $1,$0","=x,x",a); 738 } 739 } 740 else 741 { 742 __m128i _mm_cvtps_epi32 (__m128 a) pure @safe 743 { 744 __m128i r = void; 745 r[0] = convertFloatToInt32UsingMXCSR(a[0]); 746 r[1] = convertFloatToInt32UsingMXCSR(a[1]); 747 r[2] = convertFloatToInt32UsingMXCSR(a[2]); 748 r[3] = convertFloatToInt32UsingMXCSR(a[3]); 749 return r; 750 } 751 } 752 unittest 753 { 754 uint savedRounding = _MM_GET_ROUNDING_MODE(); 755 756 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 757 __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 758 assert(A.array == [1, -2, 54, -3]); 759 760 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 761 A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 762 assert(A.array == [1, -3, 53, -3]); 763 764 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 765 A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 766 assert(A.array == [2, -2, 54, -2]); 767 768 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 769 A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 770 assert(A.array == [1, -2, 53, -2]); 771 772 _MM_SET_ROUNDING_MODE(savedRounding); 773 } 774 775 776 version(LDC) 777 { 778 __m128d _mm_cvtps_pd (__m128 a) pure @safe 779 { 780 // Generates cvtps2pd since LDC 1.0, no opt 781 enum ir = ` 782 %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1> 783 %r = fpext <2 x float> %v to <2 x double> 784 ret <2 x double> %r`; 785 return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a); 786 } 787 } 788 else 789 { 790 __m128d _mm_cvtps_pd (__m128 a) pure @safe 791 { 792 double2 r = void; 793 r[0] = a[0]; 794 r[1] = a[1]; 795 return r; 796 } 797 } 798 unittest 799 { 800 __m128d A = 
_mm_cvtps_pd(_mm_set1_ps(54.0f)); 801 assert(A[0] == 54.0); 802 assert(A[1] == 54.0); 803 } 804 805 double _mm_cvtsd_f64 (__m128d a) pure @safe 806 { 807 return a[0]; 808 } 809 810 version(LDC) 811 { 812 alias _mm_cvtsd_si32 = __builtin_ia32_cvtsd2si; 813 } 814 else 815 { 816 int _mm_cvtsd_si32 (__m128d a) pure @safe 817 { 818 return convertDoubleToInt32UsingMXCSR(a[0]); 819 } 820 } 821 unittest 822 { 823 assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0))); 824 } 825 826 version(LDC) 827 { 828 // Unfortunately this builtin crashes in 32-bit 829 version(X86_64) 830 alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64; 831 else 832 { 833 long _mm_cvtsd_si64 (__m128d a) pure @safe 834 { 835 return convertDoubleToInt64UsingMXCSR(a[0]); 836 } 837 } 838 } 839 else 840 { 841 long _mm_cvtsd_si64 (__m128d a) pure @safe 842 { 843 return convertDoubleToInt64UsingMXCSR(a[0]); 844 } 845 } 846 unittest 847 { 848 assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0))); 849 850 uint savedRounding = _MM_GET_ROUNDING_MODE(); 851 852 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 853 assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.5))); 854 855 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 856 assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1))); 857 858 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 859 assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1))); 860 861 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 862 assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9))); 863 864 _MM_SET_ROUNDING_MODE(savedRounding); 865 } 866 867 alias _mm_cvtsd_si64x = _mm_cvtsd_si64; 868 869 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @safe 870 { 871 // Generates cvtsd2ss since LDC 1.3 -O0 872 a[0] = b[0]; 873 return a; 874 } 875 unittest 876 { 877 __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0)); 878 assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]); 879 } 880 881 int _mm_cvtsi128_si32 (__m128i a) pure @safe 882 { 883 return a[0]; 884 } 885 886 long _mm_cvtsi128_si64 
(__m128i a) pure @safe 887 { 888 long2 la = cast(long2)a; 889 return la[0]; 890 } 891 alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64; 892 893 __m128d _mm_cvtsi32_sd(__m128d v, int x) pure @safe 894 { 895 v[0] = cast(double)x; 896 return v; 897 } 898 unittest 899 { 900 __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42); 901 assert(a.array == [42.0, 0]); 902 } 903 904 __m128i _mm_cvtsi32_si128 (int a) pure @safe 905 { 906 int4 r = [0, 0, 0, 0]; 907 r[0] = a; 908 return r; 909 } 910 unittest 911 { 912 __m128i a = _mm_cvtsi32_si128(65); 913 assert(a.array == [65, 0, 0, 0]); 914 } 915 916 917 // Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy 918 __m128d _mm_cvtsi64_sd(__m128d v, long x) pure @safe 919 { 920 v[0] = cast(double)x; 921 return v; 922 } 923 unittest 924 { 925 __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42); 926 assert(a.array == [42.0, 0]); 927 } 928 929 __m128i _mm_cvtsi64_si128 (long a) pure @safe 930 { 931 long2 r = [0, 0]; 932 r[0] = a; 933 return cast(__m128i)(r); 934 } 935 936 alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; 937 alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; 938 939 double2 _mm_cvtss_sd(double2 v, float4 x) pure @safe 940 { 941 v[0] = x[0]; 942 return v; 943 } 944 unittest 945 { 946 __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f)); 947 assert(a.array == [42.0, 0]); 948 } 949 950 long _mm_cvttss_si64 (__m128 a) pure @safe 951 { 952 return cast(long)(a[0]); // Generates cvttss2si as expected 953 } 954 unittest 955 { 956 assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); 957 } 958 959 version(LDC) 960 { 961 alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq; 962 } 963 else 964 { 965 __m128i _mm_cvttpd_epi32 (__m128d a) pure @safe 966 { 967 // Note: doesn't generate cvttpd2dq as of LDC 1.13 968 __m128i r; 969 r[0] = cast(int)a[0]; 970 r[1] = cast(int)a[1]; 971 r[2] = 0; 972 r[3] = 0; 973 return r; 974 } 975 } 976 unittest 977 { 978 __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f)); 979 assert(R.array 
== [-4, 45641, 0, 0]); 980 } 981 982 //TODO: _mm_cvttpd_pi32 983 984 __m128i _mm_cvttps_epi32 (__m128 a) pure @safe 985 { 986 // Note: Generates cvttps2dq since LDC 1.3 -O2 987 __m128i r; 988 r[0] = cast(int)a[0]; 989 r[1] = cast(int)a[1]; 990 r[2] = cast(int)a[2]; 991 r[3] = cast(int)a[3]; 992 return r; 993 } 994 unittest 995 { 996 __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f)); 997 assert(R.array == [-4, 45641, 0, 1]); 998 } 999 1000 int _mm_cvttsd_si32 (__m128d a) 1001 { 1002 // Generates cvttsd2si since LDC 1.3 -O0 1003 return cast(int)a[0]; 1004 } 1005 1006 long _mm_cvttsd_si64 (__m128d a) 1007 { 1008 // Generates cvttsd2si since LDC 1.3 -O0 1009 // but in 32-bit instead, it's a long sequence that resort to FPU 1010 return cast(long)a[0]; 1011 } 1012 1013 alias _mm_cvttsd_si64x = _mm_cvttsd_si64; 1014 1015 __m128d _mm_div_ps(__m128d a, __m128d b) 1016 { 1017 return a / b; 1018 } 1019 1020 version(DigitalMars) 1021 { 1022 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 1023 __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe 1024 { 1025 pragma(inline, false); 1026 a[0] = a[0] / b[0]; 1027 return a; 1028 } 1029 } 1030 else 1031 { 1032 __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe 1033 { 1034 a[0] /= b[0]; 1035 return a; 1036 } 1037 } 1038 unittest 1039 { 1040 __m128d a = [2.0, 4.5]; 1041 a = _mm_div_sd(a, a); 1042 assert(a.array == [1.0, 4.5]); 1043 } 1044 1045 /// Extract a 16-bit integer from `v`, selected with `index` 1046 int _mm_extract_epi16(__m128i v, int index) pure @safe 1047 { 1048 short8 r = cast(short8)v; 1049 return r[index]; 1050 } 1051 unittest 1052 { 1053 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 1054 assert(_mm_extract_epi16(A, 6) == 6); 1055 } 1056 1057 /// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`. 
1058 __m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted 1059 { 1060 short8 r = cast(short8)v; 1061 r[index] = cast(short)i; 1062 return cast(__m128i)r; 1063 } 1064 unittest 1065 { 1066 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 1067 short8 R = cast(short8) _mm_insert_epi16(A, 42, 6); 1068 short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7]; 1069 assert(R.array == correct); 1070 } 1071 1072 version(LDC) 1073 { 1074 alias _mm_lfence = __builtin_ia32_lfence; 1075 } 1076 else 1077 { 1078 void _mm_lfence() pure @safe 1079 { 1080 asm nothrow @nogc pure @safe 1081 { 1082 lfence; 1083 } 1084 } 1085 } 1086 unittest 1087 { 1088 _mm_lfence(); 1089 } 1090 1091 1092 __m128d _mm_load_pd (const(double) * mem_addr) pure 1093 { 1094 __m128d* aligned = cast(__m128d*)mem_addr; 1095 return *aligned; 1096 } 1097 1098 __m128d _mm_load_pd1 (const(double)* mem_addr) pure 1099 { 1100 double[2] arr = [*mem_addr, *mem_addr]; 1101 return loadUnaligned!(double2)(&arr[0]); 1102 } 1103 1104 __m128d _mm_load_sd (const(double)* mem_addr) pure @safe 1105 { 1106 double2 r = [0, 0]; 1107 r[0] = *mem_addr; 1108 return r; 1109 } 1110 unittest 1111 { 1112 double x = -42; 1113 __m128d a = _mm_load_sd(&x); 1114 assert(a.array == [-42.0, 0.0]); 1115 } 1116 1117 __m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted 1118 { 1119 return *mem_addr; 1120 } 1121 1122 alias _mm_load1_pd = _mm_load_pd1; 1123 1124 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @safe 1125 { 1126 a[1] = *mem_addr; 1127 return a; 1128 } 1129 1130 // Note: strange signature since the memory doesn't have to aligned 1131 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @safe 1132 { 1133 auto pLong = cast(const(long)*)mem_addr; 1134 long2 r = [0, 0]; 1135 r[0] = *pLong; 1136 return cast(__m128i)(r); 1137 } 1138 1139 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @safe 1140 { 1141 a[0] = *mem_addr; 1142 return a; 1143 } 1144 1145 __m128d _mm_loadr_pd (const(double)* mem_addr) 
pure @trusted 1146 { 1147 __m128d a = _mm_load_pd(mem_addr); 1148 return shufflevector!(__m128d, 1, 0)(a, a); 1149 } 1150 1151 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe 1152 { 1153 return loadUnaligned!(double2)(mem_addr); 1154 } 1155 1156 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted 1157 { 1158 return loadUnaligned!(__m128i)(cast(int*)mem_addr); 1159 } 1160 1161 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted 1162 { 1163 int r = *cast(int*)(mem_addr); 1164 int4 result = [0, 0, 0, 0]; 1165 result[0] = r; 1166 return result; 1167 } 1168 unittest 1169 { 1170 int r = 42; 1171 __m128i A = _mm_loadu_si32(&r); 1172 int[4] correct = [42, 0, 0, 0]; 1173 assert(A.array == correct); 1174 } 1175 1176 version(LDC) 1177 { 1178 alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128; 1179 } 1180 1181 version(LDC) 1182 { 1183 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 1184 /// (elements are not stored when the highest bit is not set in the corresponding element) 1185 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 1186 /// boundary. 
1187 alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu; // can't do it with pure IR 1188 } 1189 else 1190 { 1191 ///ditto 1192 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted 1193 { 1194 byte16 b = cast(byte16)a; 1195 byte16 m = cast(byte16)mask; 1196 byte* dest = cast(byte*)(mem_addr); 1197 foreach(j; 0..16) 1198 { 1199 if (m[j] & 128) 1200 { 1201 dest[j] = b[j]; 1202 } 1203 } 1204 } 1205 } 1206 unittest 1207 { 1208 ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]; 1209 __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0); 1210 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15); 1211 _mm_maskmoveu_si128(A, mask, dest.ptr); 1212 ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42]; 1213 assert(dest == correct); 1214 } 1215 1216 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe 1217 { 1218 // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have 1219 __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else 1220 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1221 __m128i mask = _mm_and_si128(aTob, lowerShorts); 1222 return _mm_xor_si128(b, mask); 1223 } 1224 unittest 1225 { 1226 short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-57), 1227 _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0)); 1228 short[8] correct = [45, 1, 9, 7, 9, 7, 0, 0]; 1229 assert(R.array == correct); 1230 } 1231 1232 1233 // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have 1234 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe 1235 { 1236 // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have 1237 __m128i value128 = _mm_set1_epi8(-128); 1238 __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 1239 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1240 __m128i mask = 
_mm_and_si128(aTob, higher); 1241 return _mm_xor_si128(b, mask); 1242 } 1243 unittest 1244 { 1245 byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 1246 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 1247 byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 1248 assert(R.array == correct); 1249 } 1250 1251 __m128d _mm_max_pd (__m128d a, __m128d b) pure @safe 1252 { 1253 // Generates maxpd starting with LDC 1.9 1254 a[0] = (a[0] > b[0]) ? a[0] : b[0]; 1255 a[1] = (a[1] > b[1]) ? a[1] : b[1]; 1256 return a; 1257 } 1258 unittest 1259 { 1260 __m128d A = _mm_setr_pd(4.0, 1.0); 1261 __m128d B = _mm_setr_pd(1.0, 8.0); 1262 __m128d M = _mm_max_pd(A, B); 1263 assert(M[0] == 4.0); 1264 assert(M[1] == 8.0); 1265 } 1266 1267 __m128d _mm_max_sd (__m128d a, __m128d b) pure @safe 1268 { 1269 __m128d r = a; 1270 // Generates maxsd starting with LDC 1.3 1271 r[0] = (a[0] > b[0]) ? a[0] : b[0]; 1272 return r; 1273 } 1274 unittest 1275 { 1276 __m128d A = _mm_setr_pd(1.0, 1.0); 1277 __m128d B = _mm_setr_pd(4.0, 2.0); 1278 __m128d M = _mm_max_sd(A, B); 1279 assert(M[0] == 4.0); 1280 assert(M[1] == 1.0); 1281 } 1282 1283 version(LDC) 1284 { 1285 alias _mm_mfence = __builtin_ia32_mfence; 1286 } 1287 else 1288 { 1289 void _mm_mfence() pure @safe 1290 { 1291 asm nothrow @nogc pure @safe 1292 { 1293 mfence; 1294 } 1295 } 1296 } 1297 unittest 1298 { 1299 _mm_mfence(); 1300 } 1301 1302 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe 1303 { 1304 // Note: clang uses a __builtin_ia32_pminsw128 which has disappeared from LDC LLVM (?) 
1305 // Implemented using masks and XOR 1306 __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else 1307 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1308 __m128i mask = _mm_and_si128(aTob, lowerShorts); 1309 return _mm_xor_si128(b, mask); 1310 } 1311 unittest 1312 { 1313 short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-57), 1314 _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0)); 1315 short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -57]; 1316 assert(R.array == correct); 1317 } 1318 1319 1320 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe 1321 { 1322 // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have 1323 __m128i value128 = _mm_set1_epi8(-128); 1324 __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 1325 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1326 __m128i mask = _mm_and_si128(aTob, lower); 1327 return _mm_xor_si128(b, mask); 1328 } 1329 unittest 1330 { 1331 byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 1332 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 1333 byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; 1334 assert(R.array == correct); 1335 } 1336 1337 __m128d _mm_min_pd (__m128d a, __m128d b) pure @safe 1338 { 1339 // Generates minpd starting with LDC 1.9 1340 a[0] = (a[0] < b[0]) ? a[0] : b[0]; 1341 a[1] = (a[1] < b[1]) ? a[1] : b[1]; 1342 return a; 1343 } 1344 unittest 1345 { 1346 __m128d A = _mm_setr_pd(1.0, 2.0); 1347 __m128d B = _mm_setr_pd(4.0, 1.0); 1348 __m128d M = _mm_min_pd(A, B); 1349 assert(M[0] == 1.0); 1350 assert(M[1] == 1.0); 1351 } 1352 1353 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe 1354 { 1355 // Generates minsd starting with LDC 1.3 1356 __m128d r = a; 1357 r[0] = (a[0] < b[0]) ? 
a[0] : b[0]; 1358 return r; 1359 } 1360 unittest 1361 { 1362 __m128d A = _mm_setr_pd(1.0, 3.0); 1363 __m128d B = _mm_setr_pd(4.0, 2.0); 1364 __m128d M = _mm_min_sd(A, B); 1365 assert(M[0] == 1.0); 1366 assert(M[1] == 3.0); 1367 } 1368 1369 __m128i _mm_move_epi64 (__m128i a) pure @safe 1370 { 1371 long2 result = [ 0, 0 ]; 1372 long2 la = cast(long2) a; 1373 result[0] = la[0]; 1374 return cast(__m128i)(result); 1375 } 1376 unittest 1377 { 1378 long2 A = [13, 47]; 1379 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); 1380 long[2] correct = [13, 0]; 1381 assert(B.array == correct); 1382 } 1383 1384 __m128d _mm_move_sd (__m128d a, __m128d b) pure @safe 1385 { 1386 b[1] = a[1]; 1387 return b; 1388 } 1389 unittest 1390 { 1391 double2 A = [13.0, 47.0]; 1392 double2 B = [34.0, 58.0]; 1393 double2 C = _mm_move_sd(A, B); 1394 double[2] correct = [34.0, 47.0]; 1395 assert(C.array == correct); 1396 } 1397 1398 version(LDC) 1399 { 1400 /// Create mask from the most significant bit of each 8-bit element in `v`. 1401 alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128; 1402 } 1403 else 1404 { 1405 /// Create mask from the most significant bit of each 8-bit element in `v`. 1406 int _mm_movemask_epi8(__m128i v) pure @safe 1407 { 1408 byte16 ai = cast(byte16)v; 1409 int r = 0; 1410 foreach(bit; 0..16) 1411 { 1412 if (ai[bit] < 0) r += (1 << bit); 1413 } 1414 return r; 1415 } 1416 } 1417 unittest 1418 { 1419 assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 0, 0, -1, -1, -1, 0, 0, 0, 0, -1, -1, 0, -1, -1, 0))); 1420 } 1421 1422 version(LDC) 1423 { 1424 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 1425 /// packed double-precision (64-bit) floating-point element in `v`. 1426 alias _mm_movemask_pd = __builtin_ia32_movmskpd; 1427 } 1428 else 1429 { 1430 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 1431 /// packed double-precision (64-bit) floating-point element in `v`. 
1432 int _mm_movemask_pd(__m128d v) pure @safe 1433 { 1434 long2 lv = cast(long2)v; 1435 int r = 0; 1436 if (lv[0] < 0) r += 1; 1437 if (lv[1] < 0) r += 2; 1438 return r; 1439 } 1440 } 1441 unittest 1442 { 1443 __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0); 1444 assert(_mm_movemask_pd(A) == 2); 1445 } 1446 1447 1448 // TODO: _mm_movepi64_pi64 1449 // TODO: __m128i _mm_movpi64_epi64 (__m64 a) 1450 1451 // PERF: unfortunately, __builtin_ia32_pmuludq128 disappeared from LDC 1452 // but seems there in clang 1453 __m128i _mm_mul_epu32(__m128i a, __m128i b) pure @safe 1454 { 1455 __m128i zero = _mm_setzero_si128(); 1456 long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero); 1457 long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero); 1458 static if (__VERSION__ >= 2076) 1459 { 1460 return cast(__m128i)(la * lb); 1461 } 1462 else 1463 { 1464 // long2 mul not supported before LDC 1.5 1465 la[0] *= lb[0]; 1466 la[1] *= lb[1]; 1467 return cast(__m128i)(la); 1468 } 1469 } 1470 unittest 1471 { 1472 __m128i A = _mm_set_epi32(0, 0xDEADBEEF, 0, 0xffffffff); 1473 __m128i B = _mm_set_epi32(0, 0xCAFEBABE, 0, 0xffffffff); 1474 __m128i C = _mm_mul_epu32(A, B); 1475 long2 LC = cast(long2)C; 1476 assert(LC.array[0] == 18446744065119617025uL); 1477 assert(LC.array[1] == 12723420444339690338uL); 1478 } 1479 1480 1481 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe 1482 { 1483 return a * b; 1484 } 1485 unittest 1486 { 1487 __m128d a = [-2.0, 1.5]; 1488 a = _mm_mul_pd(a, a); 1489 assert(a.array == [4.0, 2.25]); 1490 } 1491 1492 version(DigitalMars) 1493 { 1494 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 1495 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe 1496 { 1497 pragma(inline, false); 1498 a[0] = a[0] * b[0]; 1499 return a; 1500 } 1501 } 1502 else 1503 { 1504 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe 1505 { 1506 a[0] *= b[0]; 1507 return a; 1508 } 1509 } 1510 unittest 1511 { 1512 __m128d a = [-2.0, 1.5]; 1513 a = 
_mm_mul_sd(a, a); 1514 assert(a.array == [4.0, 1.5]); 1515 } 1516 1517 1518 // TODO: _mm_mul_su32 1519 1520 version(LDC) 1521 { 1522 alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128; 1523 } 1524 else 1525 { 1526 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @safe 1527 { 1528 short8 sa = cast(short8)a; 1529 short8 sb = cast(short8)b; 1530 short8 r = void; 1531 r[0] = (sa[0] * sb[0]) >> 16; 1532 r[1] = (sa[1] * sb[1]) >> 16; 1533 r[2] = (sa[2] * sb[2]) >> 16; 1534 r[3] = (sa[3] * sb[3]) >> 16; 1535 r[4] = (sa[4] * sb[4]) >> 16; 1536 r[5] = (sa[5] * sb[5]) >> 16; 1537 r[6] = (sa[6] * sb[6]) >> 16; 1538 r[7] = (sa[7] * sb[7]) >> 16; 1539 return cast(__m128i)r; 1540 } 1541 } 1542 unittest 1543 { 1544 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 1545 __m128i B = _mm_set1_epi16(16384); 1546 short8 R = cast(short8)_mm_mulhi_epi16(A, B); 1547 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; 1548 assert(R.array == correct); 1549 } 1550 1551 version(LDC) 1552 { 1553 alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128; 1554 } 1555 else 1556 { 1557 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @safe 1558 { 1559 short8 sa = cast(short8)a; 1560 short8 sb = cast(short8)b; 1561 short8 r = void; 1562 r[0] = cast(short)( (cast(ushort)sa[0] * cast(ushort)sb[0]) >> 16 ); 1563 r[1] = cast(short)( (cast(ushort)sa[1] * cast(ushort)sb[1]) >> 16 ); 1564 r[2] = cast(short)( (cast(ushort)sa[2] * cast(ushort)sb[2]) >> 16 ); 1565 r[3] = cast(short)( (cast(ushort)sa[3] * cast(ushort)sb[3]) >> 16 ); 1566 r[4] = cast(short)( (cast(ushort)sa[4] * cast(ushort)sb[4]) >> 16 ); 1567 r[5] = cast(short)( (cast(ushort)sa[5] * cast(ushort)sb[5]) >> 16 ); 1568 r[6] = cast(short)( (cast(ushort)sa[6] * cast(ushort)sb[6]) >> 16 ); 1569 r[7] = cast(short)( (cast(ushort)sa[7] * cast(ushort)sb[7]) >> 16 ); 1570 return cast(__m128i)r; 1571 } 1572 } 1573 unittest 1574 { 1575 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 1576 __m128i B = _mm_set1_epi16(16384); 1577 short8 R = 
cast(short8)_mm_mulhi_epu16(A, B); 1578 short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; 1579 assert(R.array == correct); 1580 } 1581 1582 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) 1583 { 1584 return cast(__m128i)(cast(short8)a * cast(short8)b); 1585 } 1586 1587 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe 1588 { 1589 return cast(__m128d)( cast(__m128i)a | cast(__m128i)b ); 1590 } 1591 1592 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe 1593 { 1594 return a | b; 1595 } 1596 1597 version(LDC) 1598 { 1599 alias _mm_packs_epi32 = __builtin_ia32_packssdw128; 1600 } 1601 else 1602 { 1603 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @safe 1604 { 1605 short8 r; 1606 r[0] = saturateSignedIntToSignedShort(a[0]); 1607 r[1] = saturateSignedIntToSignedShort(a[1]); 1608 r[2] = saturateSignedIntToSignedShort(a[2]); 1609 r[3] = saturateSignedIntToSignedShort(a[3]); 1610 r[4] = saturateSignedIntToSignedShort(b[0]); 1611 r[5] = saturateSignedIntToSignedShort(b[1]); 1612 r[6] = saturateSignedIntToSignedShort(b[2]); 1613 r[7] = saturateSignedIntToSignedShort(b[3]); 1614 return cast(__m128i)r; 1615 } 1616 } 1617 unittest 1618 { 1619 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 1620 short8 R = cast(short8) _mm_packs_epi32(A, A); 1621 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; 1622 assert(R.array == correct); 1623 } 1624 1625 version(LDC) 1626 { 1627 alias _mm_packs_epi16 = __builtin_ia32_packsswb128; 1628 } 1629 else 1630 { 1631 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @safe 1632 { 1633 byte16 r; 1634 short8 sa = cast(short8)a; 1635 short8 sb = cast(short8)b; 1636 foreach(i; 0..8) 1637 r[i] = saturateSignedWordToSignedByte(sa[i]); 1638 foreach(i; 0..8) 1639 r[i+8] = saturateSignedWordToSignedByte(sb[i]); 1640 return cast(__m128i)r; 1641 } 1642 } 1643 unittest 1644 { 1645 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); 1646 byte16 R = cast(byte16) _mm_packs_epi16(A, A); 1647 byte[16] correct = 
[127, -128, 127, 0, 127, -128, 127, 0, 1648 127, -128, 127, 0, 127, -128, 127, 0]; 1649 assert(R.array == correct); 1650 } 1651 1652 version(LDC) 1653 { 1654 alias _mm_packus_epi16 = __builtin_ia32_packuswb128; 1655 } 1656 else 1657 { 1658 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure 1659 { 1660 short8 sa = cast(short8)a; 1661 short8 sb = cast(short8)b; 1662 ubyte[16] result = void; 1663 for (int i = 0; i < 8; ++i) 1664 { 1665 short s = sa[i]; 1666 if (s < 0) s = 0; 1667 if (s > 255) s = 255; 1668 result[i] = cast(ubyte)s; 1669 1670 s = sb[i]; 1671 if (s < 0) s = 0; 1672 if (s > 255) s = 255; 1673 result[i+8] = cast(ubyte)s; 1674 } 1675 return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr); 1676 } 1677 } 1678 unittest 1679 { 1680 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 1681 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 1682 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 1683 0, 255, 0, 255, 255, 2, 1, 0]; 1684 foreach(i; 0..16) 1685 assert(AA[i] == cast(byte)(correctResult[i])); 1686 } 1687 1688 version(LDC) 1689 { 1690 alias _mm_pause = __builtin_ia32_pause; 1691 } 1692 else 1693 { 1694 void _mm_pause() pure @safe 1695 { 1696 asm nothrow @nogc pure @safe 1697 { 1698 rep; nop; // F3 90 = pause 1699 } 1700 } 1701 } 1702 unittest 1703 { 1704 _mm_pause(); 1705 } 1706 1707 1708 version(LDC) 1709 { 1710 alias _mm_sad_epu8 = __builtin_ia32_psadbw128; 1711 } 1712 else 1713 { 1714 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @safe 1715 { 1716 byte16 ab = cast(byte16)a; 1717 byte16 bb = cast(byte16)b; 1718 ubyte[16] t; 1719 foreach(i; 0..16) 1720 { 1721 int diff = cast(ubyte)(ab[i]) - cast(ubyte)(bb[i]); 1722 if (diff < 0) diff = -diff; 1723 t[i] = cast(ubyte)(diff); 1724 } 1725 int4 r = _mm_setzero_si128(); 1726 r[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 1727 r[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; 1728 return r; 1729 } 1730 } 1731 unittest 1732 { 1733 
__m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 1734 __m128i B = _mm_set1_epi8(1); 1735 __m128i R = _mm_sad_epu8(A, B); 1736 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 1737 0, 1738 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 1739 0]; 1740 assert(R.array == correct); 1741 } 1742 1743 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 1744 { 1745 short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7]; 1746 return cast(__m128i) loadUnaligned!(short8)(result.ptr); 1747 } 1748 unittest 1749 { 1750 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 1751 short8 B = cast(short8) A; 1752 foreach(i; 0..8) 1753 assert(B.array[i] == i); 1754 } 1755 1756 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 1757 { 1758 int[4] result = [e0, e1, e2, e3]; 1759 return loadUnaligned!(int4)(result.ptr); 1760 } 1761 unittest 1762 { 1763 __m128i A = _mm_set_epi32(3, 2, 1, 0); 1764 foreach(i; 0..4) 1765 assert(A.array[i] == i); 1766 } 1767 1768 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted 1769 { 1770 long[2] result = [e0[0], e1[0]]; 1771 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 1772 } 1773 unittest 1774 { 1775 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); 1776 long2 B = cast(long2) A; 1777 assert(B.array[0] == 5678); 1778 assert(B.array[1] == 1234); 1779 } 1780 1781 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted 1782 { 1783 long[2] result = [e0, e1]; 1784 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 1785 } 1786 unittest 1787 { 1788 __m128i A = _mm_set_epi64x(1234, 5678); 1789 long2 B = cast(long2) A; 1790 assert(B.array[0] == 5678); 1791 assert(B.array[1] == 1234); 1792 } 1793 1794 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12, 1795 byte e11, byte e10, byte e9, byte e8, 1796 byte e7, byte e6, byte e5, byte e4, 1797 byte e3, byte e2, byte e1, byte e0) pure 
@trusted 1798 { 1799 byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7, 1800 e8, e9, e10, e11, e12, e13, e14, e15]; 1801 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 1802 } 1803 1804 __m128d _mm_set_pd (double e1, double e0) pure @trusted 1805 { 1806 double[2] result = [e0, e1]; 1807 return loadUnaligned!(double2)(result.ptr); 1808 } 1809 1810 __m128d _mm_set_pd1 (double a) pure @trusted 1811 { 1812 double[2] result = [a, a]; 1813 return loadUnaligned!(double2)(result.ptr); 1814 } 1815 1816 __m128d _mm_set_sd (double a) pure @trusted 1817 { 1818 double[2] result = [a, 0]; 1819 return loadUnaligned!(double2)(result.ptr); 1820 } 1821 1822 __m128i _mm_set1_epi16 (short a) pure @trusted 1823 { 1824 short[8] result = [a, a, a, a, a, a, a, a]; 1825 return cast(__m128i)( loadUnaligned!(short8)(result.ptr) ); 1826 } 1827 1828 __m128i _mm_set1_epi32 (int a) pure @trusted 1829 { 1830 int[4] result = [a, a, a, a]; 1831 return loadUnaligned!(int4)(result.ptr); 1832 } 1833 unittest 1834 { 1835 __m128 a = _mm_set1_ps(-1.0f); 1836 __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff); 1837 assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]); 1838 } 1839 1840 /// Broadcast 64-bit integer `a` to all elements of `dst`. 
1841 __m128i _mm_set1_epi64 (__m64 a) pure @safe 1842 { 1843 return _mm_set_epi64(a, a); 1844 } 1845 1846 __m128i _mm_set1_epi64x (long a) pure @trusted 1847 { 1848 long[2] result = [a, a]; 1849 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 1850 } 1851 1852 __m128i _mm_set1_epi8 (byte a) pure @trusted 1853 { 1854 byte[16] result = [a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a]; 1855 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 1856 } 1857 1858 alias _mm_set1_pd = _mm_set_pd1; 1859 1860 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 1861 { 1862 short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0]; 1863 return cast(__m128i)( loadUnaligned!(short8)(result.ptr) ); 1864 } 1865 1866 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted 1867 { 1868 int[4] result = [e3, e2, e1, e0]; 1869 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 1870 } 1871 1872 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted 1873 { 1874 long[2] result = [e1, e0]; 1875 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 1876 } 1877 1878 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12, 1879 byte e11, byte e10, byte e9, byte e8, 1880 byte e7, byte e6, byte e5, byte e4, 1881 byte e3, byte e2, byte e1, byte e0) pure @trusted 1882 { 1883 byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, 1884 e7, e6, e5, e4, e3, e2, e1, e0]; 1885 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 1886 } 1887 1888 __m128d _mm_setr_pd (double e1, double e0) pure @trusted 1889 { 1890 double[2] result = [e1, e0]; 1891 return loadUnaligned!(double2)(result.ptr); 1892 } 1893 1894 __m128d _mm_setzero_pd () pure @trusted 1895 { 1896 double[2] result = [0.0, 0.0]; 1897 return loadUnaligned!(double2)(result.ptr); 1898 } 1899 1900 __m128i _mm_setzero_si128() pure @trusted 1901 { 1902 int[4] result = [0, 0, 0, 0]; 1903 return cast(__m128i)( 
loadUnaligned!(int4)(result.ptr) ); 1904 } 1905 1906 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe 1907 { 1908 return shufflevector!(int4, (imm8 >> 0) & 3, 1909 (imm8 >> 2) & 3, 1910 (imm8 >> 4) & 3, 1911 (imm8 >> 6) & 3)(a, a); 1912 } 1913 unittest 1914 { 1915 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 1916 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 1917 int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A); 1918 int[4] expectedB = [ 3, 2, 1, 0 ]; 1919 assert(B.array == expectedB); 1920 } 1921 1922 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe 1923 { 1924 return shufflevector!(double2, 0 + ( imm8 & 1 ), 1925 2 + ( (imm8 >> 1) & 1 ))(a, b); 1926 } 1927 unittest 1928 { 1929 __m128d A = _mm_setr_pd(0.5, 2.0); 1930 __m128d B = _mm_setr_pd(4.0, 5.0); 1931 enum int SHUFFLE = _MM_SHUFFLE2(1, 1); 1932 __m128d R = _mm_shuffle_pd!SHUFFLE(A, B); 1933 double[2] correct = [ 2.0, 5.0 ]; 1934 assert(R.array == correct); 1935 } 1936 1937 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe 1938 { 1939 return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3, 1940 4 + ( (imm8 >> 0) & 3 ), 1941 4 + ( (imm8 >> 2) & 3 ), 1942 4 + ( (imm8 >> 4) & 3 ), 1943 4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a); 1944 } 1945 unittest 1946 { 1947 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 1948 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 1949 short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A); 1950 short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ]; 1951 assert(C.array == expectedC); 1952 } 1953 1954 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe 1955 { 1956 return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ), 1957 ( (imm8 >> 2) & 3 ), 1958 ( (imm8 >> 4) & 3 ), 1959 ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a); 1960 } 1961 unittest 1962 { 1963 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 1964 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 1965 short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A); 
1966 short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ]; 1967 assert(B.array == expectedB); 1968 } 1969 1970 version(LDC) 1971 { 1972 alias _mm_sll_epi32 = __builtin_ia32_pslld128; 1973 } 1974 else 1975 { 1976 __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe 1977 { 1978 int4 r = void; 1979 long2 lc = cast(long2)count; 1980 int bits = cast(int)(lc[0]); 1981 foreach(i; 0..4) 1982 r[i] = cast(uint)(a[i]) << bits; 1983 return r; 1984 } 1985 } 1986 unittest 1987 { 1988 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 1989 __m128i B = _mm_sll_epi32(A, _mm_cvtsi32_si128(1)); 1990 int[4] expectedB = [ 0, 4, 6, -8]; 1991 assert(B.array == expectedB); 1992 } 1993 1994 version(LDC) 1995 { 1996 alias _mm_sll_epi64 = __builtin_ia32_psllq128; 1997 } 1998 else 1999 { 2000 __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe 2001 { 2002 long2 r = void; 2003 long2 sa = cast(long2)a; 2004 long2 lc = cast(long2)count; 2005 int bits = cast(int)(lc[0]); 2006 foreach(i; 0..2) 2007 r[i] = cast(ulong)(sa[i]) << bits; 2008 return cast(__m128i)r; 2009 } 2010 } 2011 unittest 2012 { 2013 __m128i A = _mm_setr_epi64(8, -4); 2014 long2 B = cast(long2) _mm_sll_epi64(A, _mm_cvtsi32_si128(1)); 2015 long[2] expectedB = [ 16, -8]; 2016 assert(B.array == expectedB); 2017 } 2018 2019 version(LDC) 2020 { 2021 alias _mm_sll_epi16 = __builtin_ia32_psllw128; 2022 } 2023 else 2024 { 2025 __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @safe 2026 { 2027 short8 sa = cast(short8)a; 2028 long2 lc = cast(long2)count; 2029 int bits = cast(int)(lc[0]); 2030 short8 r = void; 2031 foreach(i; 0..8) 2032 r[i] = cast(short)(cast(ushort)(sa[i]) << bits); 2033 return cast(int4)r; 2034 } 2035 } 2036 unittest 2037 { 2038 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 2039 short8 B = cast(short8)( _mm_sll_epi16(A, _mm_cvtsi32_si128(1)) ); 2040 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 2041 assert(B.array == expectedB); 2042 } 2043 2044 version(LDC) 2045 { 2046 alias _mm_slli_epi32 = 
__builtin_ia32_pslldi128; 2047 } 2048 else 2049 { 2050 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe 2051 { 2052 int4 r = void; 2053 foreach(i; 0..4) 2054 r[i] = cast(uint)(a[i]) << imm8; 2055 return r; 2056 } 2057 } 2058 unittest 2059 { 2060 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 2061 __m128i B = _mm_slli_epi32(A, 1); 2062 int[4] expectedB = [ 0, 4, 6, -8]; 2063 assert(B.array == expectedB); 2064 } 2065 2066 version(LDC) 2067 { 2068 alias _mm_slli_epi64 = __builtin_ia32_psllqi128; 2069 } 2070 else 2071 { 2072 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe 2073 { 2074 long2 r = void; 2075 long2 sa = cast(long2)a; 2076 foreach(i; 0..2) 2077 r[i] = cast(ulong)(sa[i]) << imm8; 2078 return cast(__m128i)r; 2079 } 2080 } 2081 unittest 2082 { 2083 __m128i A = _mm_setr_epi64(8, -4); 2084 long2 B = cast(long2) _mm_slli_epi64(A, 1); 2085 long[2] expectedB = [ 16, -8]; 2086 assert(B.array == expectedB); 2087 } 2088 2089 version(LDC) 2090 { 2091 alias _mm_slli_epi16 = __builtin_ia32_psllwi128; 2092 } 2093 else 2094 { 2095 __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @safe 2096 { 2097 short8 sa = cast(short8)a; 2098 short8 r = void; 2099 foreach(i; 0..8) 2100 r[i] = cast(short)(cast(ushort)(sa[i]) << imm8); 2101 return cast(int4)r; 2102 } 2103 } 2104 unittest 2105 { 2106 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 2107 short8 B = cast(short8)( _mm_slli_epi16(A, 1) ); 2108 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 2109 assert(B.array == expectedB); 2110 } 2111 2112 /// Shift `a` left by `imm8` bytes while shifting in zeros. 
2113 __m128i _mm_slli_si128(ubyte imm8)(__m128i op) pure @safe 2114 { 2115 static if (imm8 & 0xF0) 2116 return _mm_setzero_si128(); 2117 else 2118 return cast(__m128i) shufflevector!(byte16, 2119 16 - imm8, 17 - imm8, 18 - imm8, 19 - imm8, 20 - imm8, 21 - imm8, 22 - imm8, 23 - imm8, 2120 24 - imm8, 25 - imm8, 26 - imm8, 27 - imm8, 28 - imm8, 29 - imm8, 30 - imm8, 31 - imm8) 2121 (cast(byte16)_mm_setzero_si128(), cast(byte16)op); 2122 } 2123 unittest 2124 { 2125 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 2126 short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left 2127 short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ]; 2128 assert(R.array == correct); 2129 } 2130 2131 version(LDC) 2132 { 2133 // Disappeared with LDC 1.11 2134 static if (__VERSION__ < 2081) 2135 alias _mm_sqrt_pd = __builtin_ia32_sqrtpd; 2136 else 2137 { 2138 __m128d _mm_sqrt_pd(__m128d vec) pure @safe 2139 { 2140 vec.array[0] = llvm_sqrt(vec.array[0]); 2141 vec.array[1] = llvm_sqrt(vec.array[1]); 2142 return vec; 2143 } 2144 } 2145 } 2146 else 2147 { 2148 __m128d _mm_sqrt_pd(__m128d vec) pure @safe 2149 { 2150 vec.array[0] = sqrt(vec.array[0]); 2151 vec.array[1] = sqrt(vec.array[1]); 2152 return vec; 2153 } 2154 } 2155 2156 2157 version(LDC) 2158 { 2159 // Disappeared with LDC 1.11 2160 static if (__VERSION__ < 2081) 2161 alias _mm_sqrt_sd = __builtin_ia32_sqrtsd; 2162 else 2163 { 2164 __m128d _mm_sqrt_sd(__m128d vec) pure @safe 2165 { 2166 vec.array[0] = llvm_sqrt(vec.array[0]); 2167 vec.array[1] = vec.array[1]; 2168 return vec; 2169 } 2170 } 2171 } 2172 else 2173 { 2174 __m128d _mm_sqrt_sd(__m128d vec) pure @safe 2175 { 2176 vec.array[0] = sqrt(vec.array[0]); 2177 vec.array[1] = vec.array[1]; 2178 return vec; 2179 } 2180 } 2181 2182 2183 version(LDC) 2184 { 2185 alias _mm_sra_epi16 = __builtin_ia32_psraw128; 2186 } 2187 else 2188 { 2189 __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe 2190 { 2191 short8 sa = cast(short8)a; 2192 long2 lc = cast(long2)count; 2193 
int bits = cast(int)(lc[0]); 2194 short8 r = void; 2195 foreach(i; 0..8) 2196 r[i] = cast(short)(sa[i] >> bits); 2197 return cast(int4)r; 2198 } 2199 } 2200 unittest 2201 { 2202 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 2203 short8 B = cast(short8)( _mm_sra_epi16(A, _mm_cvtsi32_si128(1)) ); 2204 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 2205 assert(B.array == expectedB); 2206 } 2207 2208 version(LDC) 2209 { 2210 alias _mm_sra_epi32 = __builtin_ia32_psrad128; 2211 } 2212 else 2213 { 2214 __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @safe 2215 { 2216 int4 r = void; 2217 long2 lc = cast(long2)count; 2218 int bits = cast(int)(lc[0]); 2219 foreach(i; 0..4) 2220 r[i] = (a[i] >> bits); 2221 return r; 2222 } 2223 } 2224 unittest 2225 { 2226 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 2227 __m128i B = _mm_sra_epi32(A, _mm_cvtsi32_si128(1)); 2228 int[4] expectedB = [ 0, 1, 1, -2]; 2229 assert(B.array == expectedB); 2230 } 2231 2232 2233 version(LDC) 2234 { 2235 alias _mm_srai_epi16 = __builtin_ia32_psrawi128; 2236 } 2237 else 2238 { 2239 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @safe 2240 { 2241 short8 sa = cast(short8)a; 2242 short8 r = void; 2243 foreach(i; 0..8) 2244 r[i] = cast(short)(sa[i] >> imm8); 2245 return cast(int4)r; 2246 } 2247 } 2248 unittest 2249 { 2250 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 2251 short8 B = cast(short8)( _mm_srai_epi16(A, 1) ); 2252 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 2253 assert(B.array == expectedB); 2254 } 2255 2256 version(LDC) 2257 { 2258 alias _mm_srai_epi32 = __builtin_ia32_psradi128; 2259 } 2260 else 2261 { 2262 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe 2263 { 2264 int4 r = void; 2265 foreach(i; 0..4) 2266 r[i] = (a[i] >> imm8); 2267 return r; 2268 } 2269 } 2270 unittest 2271 { 2272 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 2273 __m128i B = _mm_srai_epi32(A, 1); 2274 int[4] expectedB = [ 0, 1, 1, -2]; 2275 assert(B.array == expectedB); 2276 } 2277 2278 
// Logical (zero-filling) right-shift family: _mm_srl_* take the count from the
// low 64-bit lane of a vector, _mm_srli_* take an immediate count.
// On LDC the hardware builtins are used; the portable fallbacks shift each lane
// as an unsigned value so zeros are shifted in from the left.
// NOTE(review): the fallbacks rely on D's shift semantics for counts >= lane
// width, whereas the hardware psrlw/psrld/psrlq yield all-zero lanes in that
// case — TODO confirm out-of-range counts match hardware.
version(LDC)
{
    alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
}
else
{
    /// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
    __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]); // only the low 64-bit lane of `count` is used
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) >> bits); // unsigned shift => zero fill
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srl_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srl_epi32 = __builtin_ia32_psrld128;
}
else
{
    /// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
    __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]); // only the low 64-bit lane of `count` is used
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) >> bits; // unsigned shift => zero fill
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srl_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srl_epi64 = __builtin_ia32_psrlq128;
}
else
{
    /// Shift packed 64-bit integers in `a` right by `count` while shifting in zeros.
    __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]); // only the low 64-bit lane of `count` is used
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) >> bits; // unsigned shift => zero fill
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srl_epi64(A, _mm_cvtsi32_si128(1));
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
}
else
{
    /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) >> imm8); // unsigned shift => zero fill
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srli_epi32 = __builtin_ia32_psrldi128;
}
else
{
    /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @safe
    {
        int4 r = void;
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) >> imm8; // unsigned shift => zero fill
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srli_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srli_epi64 = __builtin_ia32_psrlqi128;
}
else
{
    /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) >> imm8; // unsigned shift => zero fill
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srli_epi64(A, 1);
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    // A byte shift of 16 or more always produces zero, matching PSRLDQ.
    static if (bytes & 0xF0)
        return _mm_setzero_si128();
    else
        // Concatenate v with zeros and pick the 16 bytes starting at offset `bytes`.
        return cast(__m128i) shufflevector!(byte16,
                                            bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
                                            bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
                                           (cast(byte16) v, cast(byte16)_mm_setzero_si128());
}
unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] correct = [2, 3, 4, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
}
unittest
{
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    // Delegates to the integer byte shift; a double lane is 8 bytes.
    return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
}

/// Store 128 bits (2 doubles) from `a` into memory. `mem_addr` must be 16-byte aligned.
void _mm_store_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}

/// Store the lower double of `a` into both lanes of 16-byte-aligned memory at `mem_addr`.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 0, 0)(a, a);
}

/// Store the lower double of `a` into memory (no alignment requirement).
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a[0];
}

/// Store 128 bits of integer data from `a` into 16-byte-aligned memory.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1;

/// Store the upper double of `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a[1];
}

/// Store the lower 64 bits of integer `a` into memory.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long* dest = cast(long*)mem_addr;
    long2 la = cast(long2)a;
    // Store the full low 64-bit lane. Using `a[0]` here would read only the
    // low 32-bit int lane (then sign-extend it), dropping bits 32..63.
    *dest = la.array[0];
}
unittest
{
    long R;
    __m128i A = _mm_setr_epi64(0x1234_5678_9ABC_DEF0, -1);
    _mm_storel_epi64(cast(__m128i*)&R, A);
    assert(R == 0x1234_5678_9ABC_DEF0);
}

/// Store the lower double of `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a[0];
}

/// Store the 2 doubles of `a` into 16-byte-aligned memory in reverse order.
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}

/// Store 128 bits (2 doubles) from `a` into memory (no alignment requirement).
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!double2(a, mem_addr);
}

/// Store 128 bits of integer data from `a` into memory (no alignment requirement).
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
void _mm_stream_pd (double* mem_addr, __m128d a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // Plain aligned 128-bit store; the non-temporal hint is dropped.
    *cast(__m128d*)mem_addr = a;
}

/// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // Plain aligned 128-bit store; the non-temporal hint is dropped.
    *mem_addr = a;
}

/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address mem_addr is already in the cache,
/// the cache will be updated.
void _mm_stream_si32 (int* mem_addr, int a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    // Plain 32-bit store; the non-temporal hint is dropped.
    *mem_addr = a;
}

/// Store 64-bit integer a into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address mem_addr is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

/// Subtract packed double-precision elements in `b` from those in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    return a - b;
}

version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        pragma(inline, false);
        a[0] = a[0] - b[0];
        return a;
    }
}
else
{
    /// Subtract the lower double of `b` from the lower double of `a`;
    /// upper lane is copied from `a`.
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        a[0] -= b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}


// TODO: _mm_sub_si64

version(LDC)
{
    alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
}
else
{
    /// Subtract packed signed 16-bit integers in `b` from those in `a`,
    /// saturating the result to the short range.
    __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,    16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

version(LDC)
{
    alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
}
else
{
    /// Subtract packed signed 8-bit integers in `b` from those in `a`,
    /// saturating the result to the byte range.
    __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

version(LDC)
{
    alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
}
else
{
    /// Subtract packed unsigned 16-bit integers in `b` from those in `a`,
    /// saturating the result at zero.
    __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            // Widen to int so the unsigned difference can go negative before clamping.
            int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            res[i] = saturateSignedIntToUnsignedShort(sum);
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534, 1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}

version(LDC)
{
    alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
}
else
{
    /// Subtract packed unsigned 8-bit integers in `b` from those in `a`,
    /// saturating the result at zero.
    __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Note: the only difference between these intrinsics is the signalling
// behaviour of quiet NaNs. This is incorrect but the case where
// you would want to differentiate between qNaN and sNaN and then
// treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd;
alias _mm_ucomige_sd = _mm_comige_sd;
alias _mm_ucomigt_sd = _mm_comigt_sd;
alias _mm_ucomile_sd = _mm_comile_sd;
alias _mm_ucomilt_sd = _mm_comilt_sd;
alias _mm_ucomineq_sd = _mm_comineq_sd;

/// Return a vector with undefined contents.
__m128d _mm_undefined_pd() pure @safe
{
    __m128d result = void;
    return result;
}

/// Return a vector with undefined contents.
__m128i _mm_undefined_si128() pure @safe
{
    __m128i result = void;
    return result;
}

/// Interleave 16-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                       (cast(short8)a, cast(short8)b);
}

/// Interleave 32-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
}

/// Interleave 64-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(long2, 1, 3)(cast(long2)a, cast(long2)b);
}

/// Interleave 8-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27,
                                               12, 28, 13, 29, 14, 30, 15, 31)
                                      (cast(byte16)a, cast(byte16)b);
}

/// Interleave doubles from the high halves of `a` and `b`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    return shufflevector!(__m128d, 1, 3)(a, b);
}

/// Interleave 16-bit integers from the low halves of `a` and `b`.
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                       (cast(short8)a, cast(short8)b);
}

/// Interleave 32-bit integers from the low halves of `a` and `b`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    return shufflevector!(int4, 0, 4, 1, 5)
                         (cast(int4)a, cast(int4)b);
}

/// Interleave 64-bit integers from the low halves of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(long2, 0, 2)
                                       (cast(long2)a, cast(long2)b);
}

/// Interleave 8-bit integers from the low halves of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                                4, 20, 5, 21, 6, 22, 7, 23)
                                       (cast(byte16)a, cast(byte16)b);
}

/// Interleave doubles from the low halves of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    return shufflevector!(__m128d, 0, 2)(a, b);
}

/// Compute the bitwise XOR of packed double-precision elements in `a` and `b`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
}

/// Compute the bitwise XOR of 128 bits in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}

unittest
{
    // distance between two points in 4D
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}