/**
* Copyright: Copyright Auburn Sounds 2016-2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1

import inteli.internals;

nothrow @nogc:

// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a + cast(short8)b);
}

__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a + cast(int4)b);
}

__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a + cast(long2)b);
}

__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a + cast(byte16)b);
}

__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    a[0] += b[0];
    return a;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}

__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}

// MMXREG: _mm_add_si64

version(LDC)
{
    alias _mm_adds_epi16 = __builtin_ia32_paddsw128;
}
else
{
    __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

version(LDC)
{
    alias _mm_adds_epi8 = __builtin_ia32_paddsb128;
}
else
{
    __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}

version(LDC)
{
    alias _mm_adds_epu8 = __builtin_ia32_paddusb128;
}
else
{
    __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}

version(LDC)
{
    alias _mm_adds_epu16 = __builtin_ia32_paddusw128;
}
else
{
    __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
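
// Added example (not in the upstream source): both unsigned saturating adds
// clamp at the type maximum instead of wrapping.
unittest
{
    byte16 R8 = cast(byte16) _mm_adds_epu8(_mm_set1_epi8(cast(byte)250), _mm_set1_epi8(10));
    foreach(i; 0..16)
        assert(R8.array[i] == cast(byte)255); // 250 + 10 clamps to 255
    short8 R16 = cast(short8) _mm_adds_epu16(_mm_set1_epi16(cast(short)65500), _mm_set1_epi16(100));
    foreach(i; 0..8)
        assert(R16.array[i] == cast(short)65535); // 65500 + 100 clamps to 65535
}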

__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a & cast(__m128i)b );
}

__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    return a & b;
}

__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( (~cast(__m128i)a) & cast(__m128i)b );
}

__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
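
// Added example (not in the upstream source): the first operand is the one
// that gets complemented, i.e. the result is (~a) & b.
unittest
{
    __m128i A = _mm_setr_epi32(0xFFFFFFFF, 0xF0F0F0F0, 0, 0);
    __m128i B = _mm_setr_epi32(0x12345678, 0xFFFFFFFF, 0, 0);
    __m128i R = _mm_andnot_si128(A, B);
    assert(R.array == [0, 0x0F0F0F0F, 0, 0]);
}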

version(LDC)
{
    __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @safe
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
}
else
{
    __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr[i] = cast(short)( (cast(ushort)(sa[i]) + cast(ushort)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg[i] == 48);
}

version(LDC)
{
    __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @safe
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
}
else
{
    __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @safe
    {
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr[i] = cast(byte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg[i] == 48);
}

// Note: unlike the Intel API, the shift amount (in bytes) is a compile-time parameter.
__m128i _mm_bslli_si128(int bytes)(__m128i a) pure @safe
{
    // Generates pslldq starting with LDC 1.1 -O2
    __m128i zero = _mm_setzero_si128();
    return cast(__m128i)
        shufflevector!(byte16, 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes,
                               20 - bytes, 21 - bytes, 22 - bytes, 23 - bytes,
                               24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
                               28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
                      (cast(byte16)zero, cast(byte16)a);
}
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

// Note: unlike the Intel API, the shift amount (in bytes) is a compile-time parameter.
__m128i _mm_bsrli_si128(int bytes)(__m128i a) pure @safe
{
    // Generates psrldq starting with LDC 1.1 -O2
    __m128i zero = _mm_setzero_si128();
    return cast(__m128i)
        shufflevector!(byte16, 0 + bytes, 1 + bytes, 2 + bytes, 3 + bytes,
                               4 + bytes, 5 + bytes, 6 + bytes, 7 + bytes,
                               8 + bytes, 9 + bytes, 10 + bytes, 11 + bytes,
                               12 + bytes, 13 + bytes, 14 + bytes, 15 + bytes)
                      (cast(byte16)a, cast(byte16)zero);
}
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}

version(LDC)
{
    alias _mm_clflush = __builtin_ia32_clflush;
}
else
{
    void _mm_clflush (const(void)* p) pure @safe
    {
        version(D_InlineAsm_X86)
        {
            asm pure nothrow @nogc @safe
            {
                mov EAX, p;
                clflush [EAX];
            }
        }
        else version(D_InlineAsm_X86_64)
        {
            asm pure nothrow @nogc @safe
            {
                mov RAX, p;
                clflush [RAX];
            }
        }
        else
            static assert(false, "Should implement clflush for this compiler");
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}

__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    return equalMask!__m128i(a, b);
}
unittest
{
    int4   A = [-3, -2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0, -1];
    int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}

__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}

__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
}

__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
}

__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
}

__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
}

__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b));
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!int4(a, b));
}
unittest
{
    int4   A = [-3,  2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0,  0];
    int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}

__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b));
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct = [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    assert(C.array == correct);
}

__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
}

__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
}

__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
}

__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
}

__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}
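
// Added example (not in the upstream source): the _mm_cmplt_* wrappers are
// just _mm_cmpgt_* with swapped operands.
unittest
{
    short8   A = [-3, -2, -1,  0, 0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1, 0, -1, -2, -3];
    short[8] E = [-1, -1, -1, -1, 0,  0,  0,  0];
    short8   R = cast(short8)(_mm_cmplt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}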

__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
}

__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
}

__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.une)(a, b);
}

__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
}

__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
}

__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
}

__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
}

__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
}

__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
}

__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
}

__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
}

__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
}

__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
}

__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
}

__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
}

__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
}
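
// Added example (not in the upstream source): ordered vs unordered predicates
// differ on NaN inputs.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, 2.0);
    long2 E = cast(long2) _mm_cmpeq_pd(A, B);
    assert(E.array[0] == -1); // 1.0 == 1.0
    assert(E.array[1] == 0);  // oeq is false when either operand is NaN
    long2 U = cast(long2) _mm_cmpunord_pd(A, B);
    assert(U.array[0] == 0);
    assert(U.array[1] == -1); // the NaN makes the pair unordered
}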

// Note: we've reverted clang and GCC behaviour with regard to EFLAGS.
// Some such comparisons yield true for NaNs, others don't.

int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC
}

int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.oge)(a, b);
}

int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ogt)(a, b);
}

int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC
}

int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC
}

int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.one)(a, b);
}

version(LDC)
{
    __m128d _mm_cvtepi32_pd (__m128i a) pure @safe
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
}
else
{
    __m128d _mm_cvtepi32_pd (__m128i a) pure @safe
    {
        double2 r = void;
        r[0] = a[0];
        r[1] = a[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A[0] == 54.0);
    assert(A[1] == 54.0);
}

// PERF: verify the instruction generated
__m128 _mm_cvtepi32_ps(__m128i a) pure @safe
{
    __m128 res;
    res.array[0] = cast(float)a.array[0];
    res.array[1] = cast(float)a.array[1];
    res.array[2] = cast(float)a.array[2];
    res.array[3] = cast(float)a.array[3];
    return res;
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

version(LDC)
{
    // Like in clang, implemented with a magic intrinsic right now
    alias _mm_cvtpd_epi32 = __builtin_ia32_cvtpd2dq;

    /* Unfortunately this generates a cvttpd2dq instruction
    __m128i _mm_cvtpd_epi32 (__m128d a) pure @safe
    {
        enum ir = `
            %i = fptosi <2 x double> %0 to <2 x i32>
            %r = shufflevector <2 x i32> %i,<2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            ret <4 x i32> %r`;

        return cast(__m128i) inlineIR!(ir, __m128i, __m128d)(a);
    } */
}
else
{
    __m128i _mm_cvtpd_epi32 (__m128d a) pure @safe
    {
        __m128i r = _mm_setzero_si128();
        r[0] = convertDoubleToInt32UsingMXCSR(a[0]);
        r[1] = convertDoubleToInt32UsingMXCSR(a[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A[0] == 55 && A[1] == 61 && A[2] == 0 && A[3] == 0);
}

// MMXREG: _mm_cvtpd_pi32

version(LDC)
{
    alias _mm_cvtpd_ps = __builtin_ia32_cvtpd2ps; // can't be done with IR unfortunately
}
else
{
    __m128 _mm_cvtpd_ps (__m128d a) pure @safe
    {
        __m128 r = void;
        r[0] = a[0];
        r[1] = a[1];
        r[2] = 0;
        r[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

// MMXREG: _mm_cvtpi32_pd

version(LDC)
{
    alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
}
else
{
    __m128i _mm_cvtps_epi32 (__m128 a) pure @safe
    {
        __m128i r = void;
        r[0] = convertFloatToInt32UsingMXCSR(a[0]);
        r[1] = convertFloatToInt32UsingMXCSR(a[1]);
        r[2] = convertFloatToInt32UsingMXCSR(a[2]);
        r[3] = convertFloatToInt32UsingMXCSR(a[3]);
        return r;
    }
}
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

version(LDC)
{
    __m128d _mm_cvtps_pd (__m128 a) pure @safe
    {
        // Generates cvtps2pd since LDC 1.0, no opt
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
}
else
{
    __m128d _mm_cvtps_pd (__m128 a) pure @safe
    {
        double2 r = void;
        r[0] = a[0];
        r[1] = a[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A[0] == 54.0);
    assert(A[1] == 54.0);
}

double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return extractelement!(double2, 0)(a);
}
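
// Added example (not in the upstream source): extracts the lower double.
unittest
{
    assert(_mm_cvtsd_f64(_mm_setr_pd(4.5, -2.0)) == 4.5);
}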

version(LDC)
{
    alias _mm_cvtsd_si32 = __builtin_ia32_cvtsd2si;
}
else
{
    int _mm_cvtsd_si32 (__m128d a) pure @safe
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}

version(LDC)
{
    // Unfortunately this builtin crashes in 32-bit
    version(X86_64)
        alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64;
    else
    {
        long _mm_cvtsd_si64 (__m128d a) pure @safe
        {
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
}
else
{
    long _mm_cvtsd_si64 (__m128d a) pure @safe
    {
        return convertDoubleToInt64UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.5)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

alias _mm_cvtsd_si64x = _mm_cvtsd_si64;

__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @safe
{
    // Generates cvtsd2ss since LDC 1.3 -O0
    a[0] = b[0];
    return a;
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a[0];
}

long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la[0];
}
alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
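
// Added example (not in the upstream source): both extractions read the
// lowest lane(s) of the vector.
unittest
{
    __m128i A = _mm_setr_epi32(-1, 0x12345678, 3, 4);
    assert(_mm_cvtsi128_si32(A) == -1);
    assert(_mm_cvtsi128_si64(A) == 0x12345678_FFFFFFFF);
}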

__m128d _mm_cvtsi32_sd(__m128d v, int x) pure @safe
{
    v[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

__m128i _mm_cvtsi32_si128 (int a) pure @safe
{
    int4 r = [0, 0, 0, 0];
    r[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}

// Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy
__m128d _mm_cvtsi64_sd(__m128d v, long x) pure @safe
{
    v[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

__m128i _mm_cvtsi64_si128 (long a) pure @safe
{
    long2 r = [0, 0];
    r[0] = a;
    return cast(__m128i)(r);
}

alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd;
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;

double2 _mm_cvtss_sd(double2 v, float4 x) pure @safe
{
    v[0] = x[0];
    return v;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

version(LDC)
{
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
}
else
{
    __m128i _mm_cvttpd_epi32 (__m128d a) pure @safe
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r;
        r[0] = cast(int)a[0];
        r[1] = cast(int)a[1];
        r[2] = 0;
        r[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}

// MMXREG: _mm_cvttpd_pi32

__m128i _mm_cvttps_epi32 (__m128 a) pure @safe
{
    // Note: Generates cvttps2dq since LDC 1.3 -O2
    __m128i r;
    r[0] = cast(int)a[0];
    r[1] = cast(int)a[1];
    r[2] = cast(int)a[2];
    r[3] = cast(int)a[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a[0];
}

long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0,
    // but in 32-bit it's a long sequence that resorts to the FPU
    return cast(long)a[0];
}

alias _mm_cvttsd_si64x = _mm_cvttsd_si64;

__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}

__m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
{
    a[0] /= b[0];
    return a;
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

int _mm_extract_epi16(int imm8)(__m128i a) pure @safe
{
    return extractelement!(short8, imm8)(a);
}

__m128i _mm_insert_epi16(int imm8)(__m128i a, int i) pure @safe
{
    return insertelement!(short8, imm8)(a, i);
}
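
// Added example (not in the upstream source): round-trip through
// insert/extract at a compile-time lane index.
unittest
{
    __m128i A = _mm_set1_epi16(7);
    __m128i B = _mm_insert_epi16!4(A, 42);
    assert(_mm_extract_epi16!4(B) == 42);
    assert(_mm_extract_epi16!0(B) == 7);
}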

version(LDC)
{
    alias _mm_lfence = __builtin_ia32_lfence;
}
else
{
    void _mm_lfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
}
unittest
{
    _mm_lfence();
}

__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}

__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] arr = [*mem_addr, *mem_addr];
    return loadUnaligned!(double2)(&arr[0]);
}

__m128d _mm_load_sd (const(double)* mem_addr) pure @safe
{
    double2 r = [0, 0];
    r[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return *mem_addr;
}

alias _mm_load1_pd = _mm_load_pd1;

__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    a[1] = *mem_addr;
    return a;
}

// Note: strange signature since the memory doesn't have to be aligned
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @safe
{
    auto pLong = cast(const(long)*)mem_addr;
    long2 r = [0, 0];
    r[0] = *pLong;
    return cast(__m128i)(r);
}

__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    a[0] = *mem_addr;
    return a;
}

__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    __m128d a = _mm_load_pd(mem_addr);
    return shufflevector!(__m128d, 1, 0)(a, a);
}

__m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
{
    return loadUnaligned!(double2)(mem_addr);
}

__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return loadUnaligned!(__m128i)(cast(int*)mem_addr);
}

__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
{
    int r = *cast(int*)(mem_addr);
    int4 result = [0, 0, 0, 0];
    result[0] = r;
    return result;
}
unittest
{
    int r = 42;
    __m128i A = _mm_loadu_si32(&r);
    int[4] correct = [42, 0, 0, 0];
    assert(A.array == correct);
}

version(LDC)
{
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;
}
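else
{
    // Added fallback sketch (not in the upstream source): a scalar version for
    // non-LDC compilers, following the pmaddwd semantics documented in the
    // Intel intrinsics guide (adjacent 16-bit products summed into 32-bit lanes).
    __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        int4 r = void;
        foreach(i; 0..4)
            r[i] = sa[2*i] * sb[2*i] + sa[2*i+1] * sb[2*i+1];
        return r;
    }
}
// Added example (not in the upstream source) for _mm_madd_epi16.
unittest
{
    __m128i A = _mm_setr_epi16(1, 2, 3, 4, -1, -2, -3, -4);
    __m128i R = _mm_madd_epi16(A, A);
    int[4] correct = [1*1 + 2*2, 3*3 + 4*4, 1 + 4, 9 + 16];
    assert(R.array == correct);
}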

version(LDC)
{
    /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
    /// (elements are not stored when the highest bit is not set in the corresponding element)
    /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
    /// boundary.
    alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu; // can't do it with pure IR
}
else
{
    ///ditto
    void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted
    {
        byte16 b = cast(byte16)a;
        byte16 m = cast(byte16)mask;
        byte* dest = cast(byte*)(mem_addr);
        foreach(j; 0..16)
        {
            if (m[j] & 128)
            {
                dest[j] = b[j];
            }
        }
    }
}
unittest
{
    ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42, 42,42];
    __m128i mask = _mm_setr_epi8( 0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A    = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42, 14,42];
    assert(dest == correct);
}

__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lowerShorts);
    return _mm_xor_si128(b, mask);
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =                                  [45, 1,  9,  7, 9,  7, 0,  0];
    assert(R.array == correct);
}

__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    __m128i value128 = _mm_set1_epi8(-128);
    __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, higher);
    return _mm_xor_si128(b, mask);
}
unittest
{
    byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
                                         _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
    byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}

__m128d _mm_max_pd (__m128d a, __m128d b) pure @safe
{
    // Generates maxpd starting with LDC 1.9
    a[0] = (a[0] > b[0]) ? a[0] : b[0];
    a[1] = (a[1] > b[1]) ? a[1] : b[1];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M[0] == 4.0);
    assert(M[1] == 8.0);
}

__m128d _mm_max_sd (__m128d a, __m128d b) pure @safe
{
    __m128d r = a;
    // Generates maxsd starting with LDC 1.3
    r[0] = (a[0] > b[0]) ? a[0] : b[0];
    return r;
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M[0] == 4.0);
    assert(M[1] == 1.0);
}

version(LDC)
{
    alias _mm_mfence = __builtin_ia32_mfence;
}
else
{
    void _mm_mfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
}
unittest
{
    _mm_mfence();
}

__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    // Note: clang uses a __builtin_ia32_pminsw128 which has disappeared from LDC LLVM (?)
    // Implemented using masks and XOR
    __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lowerShorts);
    return _mm_xor_si128(b, mask);
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0,-57];
    assert(R.array == correct);
}

__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    __m128i value128 = _mm_set1_epi8(-128);
    __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lower);
    return _mm_xor_si128(b, mask);
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
                                         _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
    byte[16] correct =                                [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0];
    assert(R.array == correct);
}

__m128d _mm_min_pd (__m128d a, __m128d b) pure @safe
{
    // Generates minpd starting with LDC 1.9
    a[0] = (a[0] < b[0]) ? a[0] : b[0];
    a[1] = (a[1] < b[1]) ? a[1] : b[1];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 1.0);
    __m128d M = _mm_min_pd(A, B);
    assert(M[0] == 1.0);
    assert(M[1] == 1.0);
}

__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
{
    // Generates minsd starting with LDC 1.3
    __m128d r = a;
    r[0] = (a[0] < b[0]) ? a[0] : b[0];
    return r;
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_min_sd(A, B);
    assert(M[0] == 1.0);
    assert(M[1] == 3.0);
}

__m128i _mm_move_epi64 (__m128i a) pure @safe
{
    long2 result = [ 0, 0 ];
    long2 la = cast(long2) a;
    result[0] = la[0];
    return cast(__m128i)(result);
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] correct = [13, 0];
    assert(B.array == correct);
}

__m128d _mm_move_sd (__m128d a, __m128d b) pure @safe
{
    b[1] = a[1];
    return b;
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(C.array == correct);
}

version(LDC)
{
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}
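else
{
    // Added fallback sketch (not in the upstream source): gather the sign bits
    // of each lane, matching the pmovmskb/movmskpd semantics from the Intel guide.
    int _mm_movemask_epi8 (__m128i a) pure @safe
    {
        byte16 ai = cast(byte16)a;
        int r = 0;
        foreach(i; 0..16)
        {
            if (ai[i] < 0)
                r |= (1 << i);
        }
        return r;
    }

    int _mm_movemask_pd (__m128d a) pure @safe
    {
        long2 la = cast(long2)a;
        int r = 0;
        if (la[0] < 0) r |= 1;
        if (la[1] < 0) r |= 2;
        return r;
    }
}
// Added example (not in the upstream source) for the movemask intrinsics.
unittest
{
    assert(_mm_movemask_epi8(_mm_set1_epi8(-1)) == 0xFFFF);
    assert(_mm_movemask_epi8(_mm_setzero_si128()) == 0);
    assert(_mm_movemask_pd(_mm_setr_pd(-1.0, 1.0)) == 1);
}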

// MMXREG: _mm_movepi64_pi64
// MMXREG: __m128i _mm_movpi64_epi64 (__m64 a)

// PERF: unfortunately, __builtin_ia32_pmuludq128 disappeared from LDC,
// but it seems to be there in clang
__m128i _mm_mul_epu32(__m128i a, __m128i b) pure @safe
{
    __m128i zero = _mm_setzero_si128();
    long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
    long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
    static if (__VERSION__ >= 2076)
    {
        return cast(__m128i)(la * lb);
    }
    else
    {
        // long2 mul not supported before LDC 1.5
        la[0] *= lb[0];
        la[1] *= lb[1];
        return cast(__m128i)(la);
    }
}
unittest
{
    __m128i A = _mm_set_epi32(0, 0xDEADBEEF, 0, 0xffffffff);
    __m128i B = _mm_set_epi32(0, 0xCAFEBABE, 0, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL);
    assert(LC.array[1] == 12723420444339690338uL);
}

__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    return a * b;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_pd(a, a);
    assert(a.array == [4.0, 2.25]);
}

__m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
{
    a[0] *= b[0];
    return a;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_sd(a, a);
    assert(a.array == [4.0, 1.5]);
}

// MMXREG: _mm_mul_su32

version(LDC)
{
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
}
else
{
    __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        r[0] = (sa[0] * sb[0]) >> 16;
        r[1] = (sa[1] * sb[1]) >> 16;
        r[2] = (sa[2] * sb[2]) >> 16;
        r[3] = (sa[3] * sb[3]) >> 16;
        r[4] = (sa[4] * sb[4]) >> 16;
        r[5] = (sa[5] * sb[5]) >> 16;
        r[6] = (sa[6] * sb[6]) >> 16;
        r[7] = (sa[7] * sb[7]) >> 16;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epi16(A, B);
    short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}

version(LDC)
{
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
else
{
    __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        r[0] = cast(short)( (cast(ushort)sa[0] * cast(ushort)sb[0]) >> 16 );
        r[1] = cast(short)( (cast(ushort)sa[1] * cast(ushort)sb[1]) >> 16 );
        r[2] = cast(short)( (cast(ushort)sa[2] * cast(ushort)sb[2]) >> 16 );
        r[3] = cast(short)( (cast(ushort)sa[3] * cast(ushort)sb[3]) >> 16 );
        r[4] = cast(short)( (cast(ushort)sa[4] * cast(ushort)sb[4]) >> 16 );
        r[5] = cast(short)( (cast(ushort)sa[5] * cast(ushort)sb[5]) >> 16 );
        r[6] = cast(short)( (cast(ushort)sa[6] * cast(ushort)sb[6]) >> 16 );
        r[7] = cast(short)( (cast(ushort)sa[7] * cast(ushort)sb[7]) >> 16 );
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(A, B);
    short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}

__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a * cast(short8)b);
}
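
// Added example (not in the upstream source): only the low 16 bits of each
// product are kept.
unittest
{
    __m128i A = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
    short8 R = cast(short8) _mm_mullo_epi16(A, A);
    short[8] correct = [1, 4, 9, 16, 25, 36, 49, 64];
    assert(R.array == correct);
}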

__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
}

__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    return a | b;
}

version(LDC)
{
    alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
}
else
{
    __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @safe
    {
        short8 r;
        r[0] = saturateSignedIntToSignedShort(a[0]);
        r[1] = saturateSignedIntToSignedShort(a[1]);
        r[2] = saturateSignedIntToSignedShort(a[2]);
        r[3] = saturateSignedIntToSignedShort(a[3]);
        r[4] = saturateSignedIntToSignedShort(b[0]);
        r[5] = saturateSignedIntToSignedShort(b[1]);
        r[6] = saturateSignedIntToSignedShort(b[2]);
        r[7] = saturateSignedIntToSignedShort(b[3]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packs_epi32(A, A);
    short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
    assert(R.array == correct);
}

version(LDC)
{
    alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
}
else
{
    __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @safe
    {
        byte16 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            r[i] = saturateSignedWordToSignedByte(sa[i]);
        foreach(i; 0..8)
            r[i+8] = saturateSignedWordToSignedByte(sb[i]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte16 R = cast(byte16) _mm_packs_epi16(A, A);
    byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
                        127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

version(LDC)
{
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
else
{
    __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        ubyte[16] result = void;
        for (int i = 0; i < 8; ++i)
        {
            short s = sa[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i] = cast(ubyte)s;

            s = sb[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i+8] = cast(ubyte)s;
        }
        return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    foreach(i; 0..16)
        assert(AA[i] == cast(byte)(correctResult[i]));
}

version(LDC)
{
    alias _mm_pause = __builtin_ia32_pause;
}
else
{
    void _mm_pause() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            rep; nop; // F3 90 = pause
        }
    }
}
unittest
{
    _mm_pause();
}

version(LDC)
{
    alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
}
else
{
    __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @safe
    {
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        ubyte[16] t;
        foreach(i; 0..16)
        {
            int diff = cast(ubyte)(ab[i]) - cast(ubyte)(bb[i]);
            if (diff < 0) diff = -diff;
            t[i] = cast(ubyte)(diff);
        }
        int4 r = _mm_setzero_si128();
        r[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
        r[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m128i B = _mm_set1_epi8(1);
    __m128i R = _mm_sad_epu8(A, B);
    int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}

__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
    return cast(__m128i) loadUnaligned!(short8)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 B = cast(short8) A;
    foreach(i; 0..8)
        assert(B.array[i] == i);
}

__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(A.array[i] == i);
}

__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    long[2] result = [e0, e1];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64x(1234, 5678);
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9,  byte e8,
                      byte e7,  byte e6,  byte e5,  byte e4,
                      byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7,
                       e8, e9, e10, e11, e12, e13, e14, e15];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e0, e1];
    return loadUnaligned!(double2)(result.ptr);
}

__m128d _mm_set_pd1 (double a) pure @trusted
{
    double[2] result = [a, a];
    return loadUnaligned!(double2)(result.ptr);
}

__m128d _mm_set_sd (double a) pure @trusted
{
    double[2] result = [a, 0];
    return loadUnaligned!(double2)(result.ptr);
}
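
// Added example (not in the upstream source): _mm_set_pd takes (e1, e0),
// and _mm_set_sd zeroes the upper lane.
unittest
{
    __m128d A = _mm_set_pd(2.0, 1.0);
    assert(A.array == [1.0, 2.0]);
    __m128d B = _mm_set_sd(42.0);
    assert(B.array == [42.0, 0.0]);
}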

__m128i _mm_set1_epi16 (short a) pure @trusted
{
    short[8] result = [a, a, a, a, a, a, a, a];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}

__m128i _mm_set1_epi32 (int a) pure @trusted
{
    int[4] result = [a, a, a, a];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128 a = _mm_set1_ps(-1.0f);
    __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff);
    assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]);
}

__m128i _mm_set1_epi64x (long a) pure @trusted
{
    long[2] result = [a, a];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}

__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    byte[16] result = [a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}
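
// Added example (not in the upstream source): every lane takes the scalar value.
unittest
{
    short8 A = cast(short8) _mm_set1_epi16(-3);
    byte16 B = cast(byte16) _mm_set1_epi8(5);
    foreach(i; 0..8)
        assert(A.array[i] == -3);
    foreach(i; 0..16)
        assert(B.array[i] == 5);
}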

alias _mm_set1_pd = _mm_set_pd1;

__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}

__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}

__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] result = [e1, e0];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}

__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9,  byte e8,
                       byte e7,  byte e6,  byte e5,  byte e4,
                       byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
                       e7,  e6,  e5,  e4,  e3,  e2,  e1, e0];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e1, e0];
    return loadUnaligned!(double2)(result.ptr);
}

__m128d _mm_setzero_pd () pure @trusted
{
    double[2] result = [0.0, 0.0];
    return loadUnaligned!(double2)(result.ptr);
}

__m128i _mm_setzero_si128() pure @trusted
{
    int[4] result = [0, 0, 0, 0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}
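
// Added example (not in the upstream source): setr stores its arguments in
// memory order, and the zero constructors clear every lane.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    assert(A.array == [1.0, 2.0]);
    __m128i Z = _mm_setzero_si128();
    assert(Z.array == [0, 0, 0, 0]);
}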

__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    return shufflevector!(int4, (imm8 >> 0) & 3,
                                (imm8 >> 2) & 3,
                                (imm8 >> 4) & 3,
                                (imm8 >> 6) & 3)(a, a);
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                   2 + ( (imm8 >> 1) & 1 ))(a, b);
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}

__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                        4 + ( (imm8 >> 0) & 3 ),
                                        4 + ( (imm8 >> 2) & 3 ),
                                        4 + ( (imm8 >> 4) & 3 ),
                                        4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}

__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                ( (imm8 >> 2) & 3 ),
                                                ( (imm8 >> 4) & 3 ),
                                                ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else
{
    __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << bits;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_sll_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_sll_epi64 = __builtin_ia32_psllq128;
}
else
{
    __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) << bits;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_sll_epi64(A, _mm_cvtsi32_si128(1));
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_sll_epi16 = __builtin_ia32_psllw128;
}
else
{
    __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) << bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_sll_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
}
else
{
    __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe
    {
        int4 r = void;
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << imm8;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_slli_epi64 = __builtin_ia32_psllqi128;
}
else
{
    __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) << imm8;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_slli_epi64(A, 1);
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
}
else
{
    __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) << imm8);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}

/// Shift `a` left by `imm8` bytes while shifting in zeros.
__m128i _mm_slli_si128(ubyte imm8)(__m128i op) pure @safe
{
    static if (imm8 & 0xF0)
        return _mm_setzero_si128();
    else
        return cast(__m128i) shufflevector!(byte16,
        16 - imm8, 17 - imm8, 18 - imm8, 19 - imm8, 20 - imm8, 21 - imm8, 22 - imm8, 23 - imm8,
        24 - imm8, 25 - imm8, 26 - imm8, 27 - imm8, 28 - imm8, 29 - imm8, 30 - imm8, 31 - imm8)
        (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
    short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(R.array == correct);
}

version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    else
    {
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
}
else
{
    __m128d _mm_sqrt_pd(__m128d vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = sqrt(vec.array[1]);
        return vec;
    }
}

version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    else
    {
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            return vec;
        }
    }
}
else
{
    __m128d _mm_sqrt_sd(__m128d vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = vec.array[1];
        return vec;
    }
}
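
// Added example (not in the upstream source): _mm_sqrt_sd only touches the
// lower lane; exact squares keep the asserts exact.
unittest
{
    __m128d A = _mm_sqrt_pd(_mm_setr_pd(16.0, 9.0));
    assert(A.array == [4.0, 3.0]);
    __m128d B = _mm_sqrt_sd(_mm_setr_pd(25.0, 9.0));
    assert(B.array == [5.0, 9.0]);
}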

version(LDC)
{
    alias _mm_sra_epi16 = __builtin_ia32_psraw128;
}
else
{
    __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(sa[i] >> bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_sra_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_sra_epi32 = __builtin_ia32_psrad128;
}
else
{
    __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        foreach(i; 0..4)
            r[i] = (a[i] >> bits);
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_sra_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
}
else
{
    __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(sa[i] >> imm8);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srai_epi32 = __builtin_ia32_psradi128;
}
else
{
    __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe
    {
        int4 r = void;
        foreach(i; 0..4)
            r[i] = (a[i] >> imm8);
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srai_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
}
else
{
    __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) >> bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srl_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srl_epi32 = __builtin_ia32_psrld128;
}
else
{
    __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) >> bits;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srl_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srl_epi64 = __builtin_ia32_psrlq128;
}
else
{
    __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) >> bits;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srl_epi64(A, _mm_cvtsi32_si128(1));
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
}
else
{
    __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) >> imm8);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srli_epi32 = __builtin_ia32_psrldi128;
}
else
{
    __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @safe
    {
        int4 r = void;
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) >> imm8;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srli_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srli_epi64 = __builtin_ia32_psrlqi128;
}
else
{
    __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) >> imm8;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srli_epi64(A, 1);
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    static if (bytes & 0xF0)
        return _mm_setzero_si128();
    else
        return cast(__m128i) shufflevector!(byte16,
        bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
        bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
        (cast(byte16) v, cast(byte16)_mm_setzero_si128());
}
unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] correct = [2, 3, 4, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
}
unittest
{
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(R.array == correct);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srli_epi32 = __builtin_ia32_psrldi128;
}
else
{
    __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @safe
    {
        int4 r = void;
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) >> imm8;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srli_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srli_epi64 = __builtin_ia32_psrlqi128;
}
else
{
    __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) >> imm8;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srli_epi64(A, 1);
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    static if (bytes & 0xF0)
        return _mm_setzero_si128();
    else
        return cast(__m128i) shufflevector!(byte16,
                                            bytes+0, bytes+1, bytes+2, bytes+3,
                                            bytes+4, bytes+5, bytes+6, bytes+7,
                                            bytes+8, bytes+9, bytes+10, bytes+11,
                                            bytes+12, bytes+13, bytes+14, bytes+15)
                                           (cast(byte16) v, cast(byte16)_mm_setzero_si128());
}
unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] correct = [2, 3, 4, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
}
unittest
{
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
}
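// A sanity test for the #BONUS double-lane shift above; a small sketch that
// assumes `_mm_set_pd` is available in this module, just as the `_mm_srli_ps`
// test uses `_mm_set_ps`.
unittest
{
    __m128d R = _mm_srli_pd!8(_mm_set_pd(2.0, 1.0));
    double[2] correct = [2.0, 0.0];
    assert(R.array == correct);
}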
void _mm_store_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}

void _mm_store_pd1 (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 0, 0)(a, a);
}

void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 0)(a);
}

void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1;

void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 1)(a);
}

void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long* dest = cast(long*)mem_addr;
    *dest = extractelement!(long2, 0)(cast(long2)a);
}

void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = extractelement!(double2, 0)(a);
}

void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}

void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!double2(a, mem_addr);
}

void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}
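// A round-trip sanity check for the unaligned integer store; a small sketch
// reusing `_mm_setr_epi32` like the neighbouring tests.
unittest
{
    int[4] buf;
    _mm_storeu_si128(cast(__m128i*)(buf.ptr), _mm_setr_epi32(1, 2, 3, 4));
    int[4] correct = [1, 2, 3, 4];
    assert(buf == correct);
}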
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
void _mm_stream_pd (double* mem_addr, __m128d a)
{
    // BUG: see `_mm_stream_ps` for an explanation of why we don't implement non-temporal moves
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a;
}

/// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
{
    // BUG: see `_mm_stream_ps` for an explanation of why we don't implement non-temporal moves
    __m128i* dest = cast(__m128i*)mem_addr;
    *dest = a;
}

/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address mem_addr is already in the cache,
/// the cache will be updated.
void _mm_stream_si32 (int* mem_addr, int a)
{
    // BUG: see `_mm_stream_ps` for an explanation of why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Store 64-bit integer a into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address mem_addr is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG: see `_mm_stream_ps` for an explanation of why we don't implement non-temporal moves
    *mem_addr = a;
}

__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    return a - b;
}

__m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
{
    a[0] -= b[0];
    return a;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}

// MMXREG: _mm_sub_si64

version(LDC)
{
    alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
    alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
    alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
    alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
}
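// No non-LDC fallback existed for the saturating subtracts above; the following
// is a minimal portable sketch mirroring the `_mm_adds_*` fallbacks, assuming the
// same saturation helpers from `inteli.internals` (saturateSignedIntToSignedShort,
// saturateSignedWordToSignedByte, saturateSignedIntToUnsignedShort,
// saturateSignedWordToUnsignedByte).
else
{
    __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }

    __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }

    __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }

    __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epi16(_mm_set1_epi16(-32760), _mm_set1_epi16(10));
    foreach(i; 0..8)
        assert(R.array[i] == -32768); // -32770 saturates to short.min
}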
// Note: the only difference between the ucomi and comi comparison intrinsics
// is their signalling behaviour on quiet NaNs. Aliasing one to the other is
// therefore slightly incorrect, but a case where you would want to distinguish
// qNaN from sNaN and treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd;
alias _mm_ucomige_sd = _mm_comige_sd;
alias _mm_ucomigt_sd = _mm_comigt_sd;
alias _mm_ucomile_sd = _mm_comile_sd;
alias _mm_ucomilt_sd = _mm_comilt_sd;
alias _mm_ucomineq_sd = _mm_comineq_sd;

__m128d _mm_undefined_pd() pure @safe
{
    __m128d result = void;
    return result;
}

__m128i _mm_undefined_si128() pure @safe
{
    __m128i result = void;
    return result;
}

__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                       (cast(short8)a, cast(short8)b);
}

__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
}

__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(long2, 1, 3)(cast(long2)a, cast(long2)b);
}

__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27,
                                                12, 28, 13, 29, 14, 30, 15, 31)
                                       (cast(byte16)a, cast(byte16)b);
}

__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    return shufflevector!(__m128d, 1, 3)(a, b);
}

__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                       (cast(short8)a, cast(short8)b);
}

__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b);
}

__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(long2, 0, 2)(cast(long2)a, cast(long2)b);
}

__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                                4, 20, 5, 21, 6, 22, 7, 23)
                                       (cast(byte16)a, cast(byte16)b);
}

__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    return shufflevector!(__m128d, 0, 2)(a, b);
}

__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
}

__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}

unittest
{
    // distance between two points in 4D
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}
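// A sanity test for the integer unpack and xor intrinsics above; a small sketch
// reusing `_mm_setr_epi32` like the other tests in this module.
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    int[4] lo = [1, 5, 2, 6];
    int[4] hi = [3, 7, 4, 8];
    int[4] zero = [0, 0, 0, 0];
    assert(_mm_unpacklo_epi32(A, B).array == lo);
    assert(_mm_unpackhi_epi32(A, B).array == hi);
    assert(_mm_xor_si128(A, A).array == zero);
}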