/**
* Copyright: Copyright Auburn Sounds 2016-2019.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors: Guillaume Piolat
*/
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1
import inteli.mmx;
import inteli.internals;

nothrow @nogc:

// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a + cast(short8)b);
}

/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a + cast(int4)b);
}

/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a + cast(long2)b);
}

/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a + cast(byte16)b);
}

version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    /// Add the lower double-precision element of `a` and `b`; copy the upper element of `a`.
    __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] + b[0];
        return a;
    }
}
else
{
    /// Add the lower double-precision element of `a` and `b`; copy the upper element of `a`.
    __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}

/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    return a + b;
}

version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        /// Add packed signed 16-bit integers in `a` and `b` using saturation.
        // Generates PADDSW since LDC 1.15 -O0
        __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else
        alias _mm_adds_epi16 = __builtin_ia32_paddsw128;
}
else
{
    /// Add packed signed 16-bit integers in `a` and `b` using saturation.
    __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        /// Add packed signed 8-bit integers in `a` and `b` using saturation.
        // Generates PADDSB since LDC 1.15 -O0
        __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else
        alias _mm_adds_epi8 = __builtin_ia32_paddsb128;
}
else
{
    /// Add packed signed 8-bit integers in `a` and `b` using saturation.
    __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}

version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        /// Add packed unsigned 8-bit integers in `a` and `b` using saturation.
        // Generates PADDUSB since LDC 1.15 -O0
        __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else
        alias _mm_adds_epu8 = __builtin_ia32_paddusb128;
}
else
{
    /// Add packed unsigned 8-bit integers in `a` and `b` using saturation.
    __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest // added for coverage parity with the signed saturating adds
{
    byte16 res = cast(byte16) _mm_adds_epu8(_mm_set1_epi8(cast(byte)200),
                                            _mm_set1_epi8(cast(byte)100));
    foreach(i; 0..16)
        assert(res.array[i] == cast(byte)255); // 200 + 100 saturates to 255
}

version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        /// Add packed unsigned 16-bit integers in `a` and `b` using saturation.
        // Generates PADDUSW since LDC 1.15 -O0
        __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else
        alias _mm_adds_epu16 = __builtin_ia32_paddusw128;
}
else
{
    /// Add packed unsigned 16-bit integers in `a` and `b` using saturation.
    __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest // added for coverage parity with the signed saturating adds
{
    short8 res = cast(short8) _mm_adds_epu16(_mm_set1_epi16(cast(short)60000),
                                             _mm_set1_epi16(cast(short)10000));
    foreach(i; 0..8)
        assert(res.array[i] == cast(short)65535); // 60000 + 10000 saturates to 65535
}

/// Compute the bitwise AND of packed double-precision elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a & cast(__m128i)b );
}

/// Compute the bitwise AND of 128 bits in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    return a & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_and_si128(A, B);
    int[4] correct = [6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of `a` AND'ed with `b`, for packed double-precision elements.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( (~cast(__m128i)a) & cast(__m128i)b );
}

/// Compute the bitwise NOT of 128 bits in `a` AND'ed with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_andnot_si128(A, B);
    int[4] correct = [8, 8, 8, 8];
    assert(R.array == correct);
}

version(LDC)
{
    /// Average packed unsigned 16-bit integers in `a` and `b` (rounding up).
    __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @safe
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
}
else
{
    /// Average packed unsigned 16-bit integers in `a` and `b` (rounding up).
    __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr[i] = cast(ushort)( (cast(ushort)(sa[i]) + cast(ushort)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg[i] == 48);
}

version(LDC)
{
    /// Average packed unsigned 8-bit integers in `a` and `b` (rounding up).
    __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @safe
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
}
else
{
    /// Average packed unsigned 8-bit integers in `a` and `b` (rounding up).
    __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @safe
    {
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg[i] == 48);
}

// Note: unlike Intel API, shift amount is a compile-time parameter.
/// Shift `a` left by `bits` bytes while shifting in zeros.
__m128i _mm_bslli_si128(int bits)(__m128i a) pure @safe
{
    // Generates pslldq starting with LDC 1.1 -O2
    // Implemented by shuffling a zero vector together with `a`: the first
    // `bits` lanes come from the zero vector, the rest from `a`.
    __m128i zero = _mm_setzero_si128();
    return cast(__m128i)
        shufflevector!(byte16, 16 - bits, 17 - bits, 18 - bits, 19 - bits,
                               20 - bits, 21 - bits, 22 - bits, 23 - bits,
                               24 - bits, 25 - bits, 26 - bits, 27 - bits,
                               28 - bits, 29 - bits, 30 - bits, 31 - bits)
                               (cast(byte16)zero, cast(byte16)a);
}
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4,  5,  6,  7,  8,  9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

// Note: unlike Intel API, shift amount is a compile-time parameter.
/// Shift `a` right by `bits` bytes while shifting in zeros.
__m128i _mm_bsrli_si128(int bits)(__m128i a) pure @safe
{
    // Generates psrldq starting with LDC 1.1 -O2
    // Same shuffle trick as _mm_bslli_si128, but the zero vector provides
    // the high lanes instead of the low ones.
    __m128i zero = _mm_setzero_si128();
    return cast(__m128i)
        shufflevector!(byte16, 0 + bits, 1 + bits, 2 + bits, 3 + bits,
                               4 + bits, 5 + bits, 6 + bits, 7 + bits,
                               8 + bits, 9 + bits, 10 + bits, 11 + bits,
                               12 + bits, 13 + bits, 14 + bits, 15 + bits)
                               (cast(byte16)a, cast(byte16)zero);
}
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15,  0,  0,  0,  0,  0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Reinterpret `a` as a vector of 4 floats (no conversion).
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Reinterpret `a` as a vector of integers (no conversion).
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Reinterpret `a` as a vector of 2 doubles (no conversion).
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Reinterpret `a` as a vector of integers (no conversion).
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Reinterpret `a` as a vector of 2 doubles (no conversion).
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Reinterpret `a` as a vector of 4 floats (no conversion).
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}

version(LDC)
{
    alias _mm_clflush = __builtin_ia32_clflush;
}
else
{
    /// Invalidate and flush the cache line containing `p` from all cache levels.
    void _mm_clflush (const(void)* p) pure @safe
    {
        version(D_InlineAsm_X86)
        {
            asm pure nothrow @nogc @safe
            {
                mov EAX, p;
                clflush [EAX];
            }
        }
        else version(D_InlineAsm_X86_64)
        {
            asm pure nothrow @nogc @safe
            {
                mov RAX, p;
                clflush [RAX];
            }
        }
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}

/// Compare packed 16-bit integers for equality; each lane is all-ones on match, zero otherwise.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers for equality; each lane is all-ones on match, zero otherwise.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    return equalMask!__m128i(a, b);
}
unittest
{
    int4   A = [-3, -2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0, -1];
    // BUGFIX: was calling _mm_cmpeq_epi16, which only passed by coincidence on this data.
    int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers for equality; each lane is all-ones on match, zero otherwise.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct =       [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}

/// Compare packed doubles for equality (ordered).
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
}

/// Compare the lower doubles for equality (ordered); upper element copied from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
}

/// Compare packed doubles for greater-than-or-equal (ordered).
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
}

/// Compare the lower doubles for greater-than-or-equal (ordered); upper element copied from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
}

/// Compare packed signed 16-bit integers for greater-than; all-ones lane where `a > b`.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b));
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed signed 32-bit integers for greater-than; all-ones lane where `a > b`.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!int4(a, b));
}
unittest
{
    int4   A = [-3,  2, -1,  0];
    int4   B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0,  0];
    int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed signed 8-bit integers for greater-than; all-ones lane where `a > b`.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b));
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct =       [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    assert(C.array == correct); // removed unused local computing _mm_cmpeq_epi8
}

/// Compare packed doubles for greater-than (ordered).
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
}

/// Compare the lower doubles for greater-than (ordered); upper element copied from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
}

/// Compare packed doubles for less-than-or-equal (ordered).
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
}

/// Compare the lower doubles for less-than-or-equal (ordered); upper element copied from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
}

/// Compare packed signed 16-bit integers for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed signed 32-bit integers for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed signed 8-bit integers for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

/// Compare packed doubles for less-than (ordered).
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
}

/// Compare the lower doubles for less-than (ordered); upper element copied from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
}

/// Compare packed doubles for not-equal (unordered: NaN compares true).
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.une)(a, b);
}

/// Compare the lower doubles for not-equal (unordered); upper element copied from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
}

/// Compare packed doubles for not-greater-than-or-equal (unordered less-than).
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
}

/// Compare the lower doubles for not-greater-than-or-equal; upper element copied from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
}

/// Compare packed doubles for not-greater-than (unordered less-than-or-equal).
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
}

/// Compare the lower doubles for not-greater-than; upper element copied from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
}

/// Compare packed doubles for not-less-than-or-equal (unordered greater-than).
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
}

/// Compare the lower doubles for not-less-than-or-equal; upper element copied from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
}

/// Compare packed doubles for not-less-than (unordered greater-than-or-equal).
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
}

/// Compare the lower doubles for not-less-than; upper element copied from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
}

/// Compare packed doubles for orderedness (neither operand is NaN).
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
}

/// Compare the lower doubles for orderedness; upper element copied from `a`.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
}

/// Compare packed doubles for unorderedness (at least one operand is NaN).
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
}

/// Compare the lower doubles for unorderedness; upper element copied from `a`.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
}


// Note: we've reverted clang and GCC behaviour with regards to EFLAGS
// Some such comparisons yields true for NaNs, other don't.

/// Compare the lower doubles for equality; returns 1 on match, 0 otherwise.
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC
}

/// Compare the lower doubles for greater-than-or-equal; returns 1 or 0.
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.oge)(a, b);
}

/// Compare the lower doubles for greater-than; returns 1 or 0.
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ogt)(a, b);
}

/// Compare the lower doubles for less-than-or-equal; returns 1 or 0.
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC
}

/// Compare the lower doubles for less-than; returns 1 or 0.
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC
}

/// Compare the lower doubles for not-equal; returns 1 or 0.
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    return comsd!(FPComparison.one)(a, b);
}

version(LDC)
{
    /// Convert the two lower 32-bit integers of `a` to packed doubles.
    __m128d _mm_cvtepi32_pd (__m128i a) pure @safe
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
}
else
{
    /// Convert the two lower 32-bit integers of `a` to packed doubles.
    __m128d _mm_cvtepi32_pd (__m128i a) pure @safe
    {
        double2 r = void;
        r[0] = a[0];
        r[1] = a[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A[0] == 54.0);
    assert(A[1] == 54.0);
}

/// Convert packed 32-bit integers in `a` to packed single-precision floats.
__m128 _mm_cvtepi32_ps(__m128i a) pure @safe
{
    // Generates cvtdq2ps since LDC 1.0.0 -O1
    __m128 res;
    res.array[0] = cast(float)a.array[0];
    res.array[1] = cast(float)a.array[1];
    res.array[2] = cast(float)a.array[2];
    res.array[3] = cast(float)a.array[3];
    return res;
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}


version(LDC)
{
    // Like in clang, implemented with a magic intrinsic right now
    alias _mm_cvtpd_epi32 = __builtin_ia32_cvtpd2dq;

    /* Unfortunately this generates a cvttpd2dq instruction
    __m128i _mm_cvtpd_epi32 (__m128d a) pure @safe
    {
        enum ir = `
            %i = fptosi <2 x double> %0 to <2 x i32>
            %r = shufflevector <2 x i32> %i,<2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            ret <4 x i32> %r`;

        return cast(__m128i) inlineIR!(ir, __m128i, __m128d)(a);
    } */
}
else
{
    /// Convert packed doubles in `a` to packed 32-bit integers, honouring the MXCSR rounding mode.
    __m128i _mm_cvtpd_epi32 (__m128d a) pure @safe
    {
        __m128i r = _mm_setzero_si128();
        r[0] = convertDoubleToInt32UsingMXCSR(a[0]);
        r[1] = convertDoubleToInt32UsingMXCSR(a[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A[0] == 55 && A[1] == 61 && A[2] == 0 && A[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers
__m64 _mm_cvtpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A[0] == 55 && A[1] == 61);
}

version(LDC)
{
    alias _mm_cvtpd_ps = __builtin_ia32_cvtpd2ps; // can't be done with IR unfortunately
}
else
{
    /// Convert packed doubles in `a` to packed floats in the two lower lanes; upper lanes zeroed.
    __m128 _mm_cvtpd_ps (__m128d a) pure @safe
    {
        __m128 r = void;
        r[0] = a[0];
        r[1] = a[1];
        r[2] = 0;
        r[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A[0] == 4.0 && A[1] == -5.0);
}

version(LDC)
{
    // Disabled, since it fail with optimizations unfortunately
    //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;

    /// Convert packed floats in `a` to packed 32-bit integers, honouring the MXCSR rounding mode.
    __m128i _mm_cvtps_epi32 (__m128 a) pure @trusted
    {
        return __asm!__m128i("cvtps2dq $1,$0","=x,x",a);
    }
}
else
{
    /// Convert packed floats in `a` to packed 32-bit integers, honouring the MXCSR rounding mode.
    __m128i _mm_cvtps_epi32 (__m128 a) pure @safe
    {
        __m128i r = void;
        r[0] = convertFloatToInt32UsingMXCSR(a[0]);
        r[1] = convertFloatToInt32UsingMXCSR(a[1]);
        r[2] = convertFloatToInt32UsingMXCSR(a[2]);
        r[3] = convertFloatToInt32UsingMXCSR(a[3]);
        return r;
    }
}
unittest
{
    // Exercises all four MXCSR rounding modes, restoring the saved mode at the end.
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}


version(LDC)
{
    /// Convert the two lower floats of `a` to packed doubles.
    __m128d _mm_cvtps_pd (__m128 a) pure @safe
    {
        // Generates cvtps2pd since LDC 1.0, no opt
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
}
else
{
    /// Convert the two lower floats of `a` to packed doubles.
    __m128d _mm_cvtps_pd (__m128 a) pure @safe
    {
        double2 r = void;
        r[0] = a[0];
        r[1] = a[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A[0] == 54.0);
    assert(A[1] == 54.0);
}

/// Return the lower double of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a[0];
}

version(LDC)
{
    alias _mm_cvtsd_si32 = __builtin_ia32_cvtsd2si;
}
else
{
    /// Convert the lower double of `a` to a 32-bit integer, honouring the MXCSR rounding mode.
    int _mm_cvtsd_si32 (__m128d a) pure @safe
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}

version(LDC)
{
    // Unfortunately this builtin crashes in 32-bit
    version(X86_64)
        alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64;
    else
    {
        /// Convert the lower double of `a` to a 64-bit integer, honouring the MXCSR rounding mode.
        long _mm_cvtsd_si64 (__m128d a) pure @safe
        {
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
}
else
{
    /// Convert the lower double of `a` to a 64-bit integer, honouring the MXCSR rounding mode.
    long _mm_cvtsd_si64 (__m128d a) pure @safe
    {
        return convertDoubleToInt64UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.5)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

alias _mm_cvtsd_si64x = _mm_cvtsd_si64;

/// Convert the lower double of `b` to a float in the lower lane of `a`; other lanes copied from `a`.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @safe
{
    // Generates cvtsd2ss since LDC 1.3 -O0
    a[0] = b[0];
    return a;
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Return the lower 32-bit integer of `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a[0];
}

/// Return the lower 64-bit integer of `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la[0];
}
alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;

/// Copy `v`, converting `x` to a double in the lower lane.
__m128d _mm_cvtsi32_sd(__m128d v, int x) pure @safe
{
    v[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Place 32-bit integer `a` in the lower lane; zero the upper lanes.
__m128i _mm_cvtsi32_si128 (int a) pure @safe
{
    int4 r = [0, 0, 0, 0];
    r[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}


// Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy
/// Copy `v`, converting 64-bit integer `x` to a double in the lower lane.
__m128d _mm_cvtsi64_sd(__m128d v, long x) pure @safe
{
    v[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Place 64-bit integer `a` in the lower half; zero the upper half.
__m128i _mm_cvtsi64_si128 (long a) pure @safe
{
    long2 r = [0, 0];
    r[0] = a;
    return cast(__m128i)(r);
}

alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd;
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;

/// Copy `v`, converting the lower float of `x` to a double in the lower lane.
double2 _mm_cvtss_sd(double2 v, float4 x) pure @safe
{
    v[0] = x[0];
    return v;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

/// Convert the lower float of `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

version(LDC)
{
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
}
else
{
    /// Convert packed doubles in `a` to packed 32-bit integers with truncation.
    __m128i _mm_cvttpd_epi32 (__m128d a) pure @safe
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r;
        r[0] = cast(int)a[0];
        r[1] = cast(int)a[1];
        r[2] = 0;
        r[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}


/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed floats in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @safe
{
    // Note: Generates cvttps2dq since LDC 1.3 -O2
    __m128i r;
    r[0] = cast(int)a[0];
    r[1] = cast(int)a[1];
    r[2] = cast(int)a[2];
    r[3] = cast(int)a[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

/// Convert the lower double of `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a[0];
}

/// Convert the lower double of `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resort to FPU
    return cast(long)a[0];
}

alias _mm_cvttsd_si64x = _mm_cvttsd_si64;

/// Divide packed doubles in `a` by packed doubles in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}

version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    /// Divide the lower double of `a` by the lower double of `b`; upper element copied from `a`.
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] / b[0];
        return a;
    }
}
else
{
    /// Divide the lower double of `a` by the lower double of `b`; upper element copied from `a`.
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        a[0] /= b[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r[index]); // zero-extended, hence -1 reads back as 65535
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r[index & 7] = cast(short)i; // index masked to 0..7, matching PINSRW behaviour
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

// Serializing load fence (LFENCE instruction).
version(LDC)
{
    alias _mm_lfence = __builtin_ia32_lfence;
}
else
{
    void _mm_lfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
}
unittest
{
    _mm_lfence();
}


/// Load 2 packed doubles from memory. `mem_addr` must be 16-byte aligned
/// (this is a plain aligned dereference).
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}

/// Load a double from memory and broadcast it to both elements.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] arr = [*mem_addr, *mem_addr];
    return loadUnaligned!(double2)(&arr[0]);
}

/// Load a double into the lower element; the upper element is zeroed.
__m128d _mm_load_sd (const(double)* mem_addr) pure @safe
{
    double2 r = [0, 0];
    r[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Load 128 bits of integer data from memory (aligned dereference).
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return *mem_addr;
}

alias _mm_load1_pd = _mm_load_pd1;

/// Load a double into the upper element, keeping the lower element of `a`.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    a[1] = *mem_addr;
    return a;
}

/// Load a 64-bit integer into the lower element; the upper element is zeroed.
// Note: strange signature since the memory doesn't have to be aligned
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @safe
{
    auto pLong = cast(const(long)*)mem_addr;
    long2 r = [0, 0];
    r[0] = *pLong;
    return cast(__m128i)(r);
}

/// Load a double into the lower element, keeping the upper element of `a`.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    a[0] = *mem_addr;
    return a;
}

/// Load 2 doubles from aligned memory in reverse order.
__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    __m128d a = _mm_load_pd(mem_addr);
    return shufflevector!(__m128d, 1, 0)(a, a);
}

/// Load 2 doubles from unaligned memory.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
{
    return loadUnaligned!(double2)(mem_addr);
}

/// Load 128 bits of integer data from unaligned memory.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return loadUnaligned!(__m128i)(cast(int*)mem_addr);
}

/// Load a 32-bit integer into the lowest element; remaining elements are zeroed.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
{
    // NOTE(review): plain deref through a cast pointer assumes unaligned int reads
    // are permitted on the target (true on x86) — confirm for any other target.
    int r = *cast(int*)(mem_addr);
    int4 result = [0, 0, 0, 0];
    result[0] = r;
    return result;
}
unittest
{
    int r = 42;
    __m128i A = _mm_loadu_si32(&r);
    int[4] correct = [42, 0, 0, 0];
    assert(A.array == correct);
}

version(LDC)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;
}
else
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;

        int4 r;
        foreach(i; 0..4)
        {
            // Each product fits in 31 bits; only the pair sum can wrap
            // (e.g. (-32768)^2 * 2, see unittest below).
            r[i] = sa[2*i] * sb[2*i] + sa[2*i+1] * sb[2*i+1];
        }
        return r;
    }
}
unittest
{
    short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
    int[4] correct = [1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}

version(LDC)
{
    /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
    /// (elements are not stored when the highest bit is not set in the corresponding element)
    /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
    /// boundary.
    alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu; // can't do it with pure IR
}
else
{
    ///ditto
    void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted
    {
        byte16 b = cast(byte16)a;
        byte16 m = cast(byte16)mask;
        byte* dest = cast(byte*)(mem_addr);
        foreach(j; 0..16)
        {
            if (m[j] & 128) // store only where the mask byte has its MSB set
            {
                dest[j] = b[j];
            }
        }
    }
}
unittest
{
    ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
    __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A = _mm_setr_epi8(   0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
    assert(dest == correct);
}

/// Compare packed signed 16-bit integers, return packed maximum values.
__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lowerShorts);
    return _mm_xor_si128(b, mask); // b ^ ((a ^ b) & sel) picks a where sel is all-ones
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-57),
                                          _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0));
    short[8] correct =                                  [45, 1, 9, 7, 9, 7, 0, 0];
    assert(R.array == correct);
}


/// Compare packed unsigned 8-bit integers, return packed maximum values.
// Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    // Biasing by 128 turns the unsigned compare into a signed one.
    __m128i value128 = _mm_set1_epi8(-128);
    __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, higher);
    return _mm_xor_si128(b, mask);
}
unittest
{
    byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
                                         _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
    byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}

/// Compare packed doubles, return packed maximum values.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @safe
{
    // Generates maxpd starting with LDC 1.9
    a[0] = (a[0] > b[0]) ? a[0] : b[0];
    a[1] = (a[1] > b[1]) ? a[1] : b[1];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M[0] == 4.0);
    assert(M[1] == 8.0);
}

/// Return the maximum of the lower doubles; the upper double comes from `a`.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @safe
{
    __m128d r = a;
    // Generates maxsd starting with LDC 1.3
    r[0] = (a[0] > b[0]) ? a[0] : b[0];
    return r;
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M[0] == 4.0);
    assert(M[1] == 1.0);
}

// Full memory fence (MFENCE instruction).
version(LDC)
{
    alias _mm_mfence = __builtin_ia32_mfence;
}
else
{
    void _mm_mfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
}
unittest
{
    _mm_mfence();
}

/// Compare packed signed 16-bit integers, return packed minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    // Note: clang uses a __builtin_ia32_pminsw128 which has disappeared from LDC LLVM (?)
    // Implemented using masks and XOR
    __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lowerShorts);
    return _mm_xor_si128(b, mask);
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-57),
                                          _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0));
    short[8] correct =                                  [-4,-8, -4, -8, 0,-57, 0, -57];
    assert(R.array == correct);
}


/// Compare packed unsigned 8-bit integers, return packed minimum values.
__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    // Biasing by 128 turns the unsigned compare into a signed one.
    __m128i value128 = _mm_set1_epi8(-128);
    __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lower);
    return _mm_xor_si128(b, mask);
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
                                         _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
    byte[16] correct =                                [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0];
    assert(R.array == correct);
}

/// Compare packed doubles, return packed minimum values.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @safe
{
    // Generates minpd starting with LDC 1.9
    a[0] = (a[0] < b[0]) ? a[0] : b[0];
    a[1] = (a[1] < b[1]) ? a[1] : b[1];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 1.0);
    __m128d M = _mm_min_pd(A, B);
    assert(M[0] == 1.0);
    assert(M[1] == 1.0);
}

/// Return the minimum of the lower doubles; the upper double comes from `a`.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
{
    // Generates minsd starting with LDC 1.3
    __m128d r = a;
    r[0] = (a[0] < b[0]) ? a[0] : b[0];
    return r;
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_min_sd(A, B);
    assert(M[0] == 1.0);
    assert(M[1] == 3.0);
}

/// Copy the lower 64-bit integer of `a`; the upper element is zeroed.
__m128i _mm_move_epi64 (__m128i a) pure @safe
{
    long2 result = [ 0, 0 ];
    long2 la = cast(long2) a;
    result[0] = la[0];
    return cast(__m128i)(result);
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] correct = [13, 0];
    assert(B.array == correct);
}

/// Return a vector with the lower double from `b` and the upper double from `a`.
__m128d _mm_move_sd (__m128d a, __m128d b) pure @safe
{
    b[1] = a[1];
    return b;
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(C.array == correct);
}

version(LDC)
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
}
else
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    int _mm_movemask_epi8(__m128i v) pure @safe
    {
        byte16 ai = cast(byte16)v;
        int r = 0;
        foreach(bit; 0..16)
        {
            if (ai[bit] < 0) r += (1 << bit); // a byte is negative iff its MSB is set
        }
        return r;
    }
}
unittest
{
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 0, 0, -1, -1, -1, 0, 0, 0, 0, -1, -1, 0, -1, -1, 0)));
}

version(LDC)
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding
    /// packed double-precision (64-bit) floating-point element in `v`.
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}
else
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding
    /// packed double-precision (64-bit) floating-point element in `v`.
    int _mm_movemask_pd(__m128d v) pure @safe
    {
        // Reinterpreting as long2 lets the sign test read the double's sign bit.
        long2 lv = cast(long2)v;
        int r = 0;
        if (lv[0] < 0) r += 1;
        if (lv[1] < 0) r += 2;
        return r;
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
    assert(_mm_movemask_pd(A) == 2);
}

/// Copy the lower 64-bit integer in `v`.
__m64 _mm_movepi64_pi64 (__m128i v) pure @safe
{
    long2 lv = cast(long2)v;
    return long1(lv[0]);
}
unittest
{
    __m128i A = _mm_set_epi64x(-1, -2);
    __m64 R = _mm_movepi64_pi64(A);
    assert(R[0] == -2);
}

/// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
__m128i _mm_movpi64_epi64 (__m64 a) pure @safe
{
    long2 r;
    r[0] = a[0];
    r[1] = 0;
    return cast(__m128i)r;
}

// PERF: unfortunately, __builtin_ia32_pmuludq128 disappeared from LDC
// but seems there in clang
/// Multiply the low unsigned 32-bit integers of each 64-bit half of `a` and `b`,
/// producing two unsigned 64-bit results (PMULUDQ semantics).
__m128i _mm_mul_epu32(__m128i a, __m128i b) pure @safe
{
    __m128i zero = _mm_setzero_si128();
    // Interleaving the even 32-bit lanes with zero zero-extends them to 64 bits.
    long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
    long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
    static if (__VERSION__ >= 2076)
    {
        return cast(__m128i)(la * lb);
    }
    else
    {
        // long2 mul not supported before LDC 1.5
        la[0] *= lb[0];
        la[1] *= lb[1];
        return cast(__m128i)(la);
    }
}
unittest
{
    __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
    __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL);
    assert(LC.array[1] == 12723420444339690338uL);
}


/// Multiply packed doubles.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    return a * b;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_pd(a, a);
    assert(a.array == [4.0, 2.25]);
}

version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    /// Multiply the lower doubles; the upper double comes from `a`.
    __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] * b[0];
        return a;
    }
}
else
{
    /// Multiply the lower doubles; the upper double comes from `a`.
    __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
    {
        a[0] *= b[0];
        return a;
    }
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_sd(a, a);
    assert(a.array == [4.0, 1.5]);
}

/// Multiply the low unsigned 32-bit integers from `a` and `b`,
/// and get an unsigned 64-bit result.
__m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
    __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
    __m64 C = _mm_mul_su32(A, B);
    assert(C[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
}

// Multiply packed signed 16-bit integers, keep the high 16 bits of each 32-bit product.
version(LDC)
{
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
}
else
{
    __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        r[0] = (sa[0] * sb[0]) >> 16;
        r[1] = (sa[1] * sb[1]) >> 16;
        r[2] = (sa[2] * sb[2]) >> 16;
        r[3] = (sa[3] * sb[3]) >> 16;
        r[4] = (sa[4] * sb[4]) >> 16;
        r[5] = (sa[5] * sb[5]) >> 16;
        r[6] = (sa[6] * sb[6]) >> 16;
        r[7] = (sa[7] * sb[7]) >> 16;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epi16(A, B);
    short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}

// Multiply packed unsigned 16-bit integers, keep the high 16 bits of each 32-bit product.
version(LDC)
{
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
else
{
    __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        r[0] = cast(short)( (cast(ushort)sa[0] * cast(ushort)sb[0]) >> 16 );
        r[1] = cast(short)( (cast(ushort)sa[1] * cast(ushort)sb[1]) >> 16 );
        r[2] = cast(short)( (cast(ushort)sa[2] * cast(ushort)sb[2]) >> 16 );
        r[3] = cast(short)( (cast(ushort)sa[3] * cast(ushort)sb[3]) >> 16 );
        r[4] = cast(short)( (cast(ushort)sa[4] * cast(ushort)sb[4]) >> 16 );
        r[5] = cast(short)( (cast(ushort)sa[5] * cast(ushort)sb[5]) >> 16 );
        r[6] = cast(short)( (cast(ushort)sa[6] * cast(ushort)sb[6]) >> 16 );
        r[7] = cast(short)( (cast(ushort)sa[7] * cast(ushort)sb[7]) >> 16 );
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(A, B);
    short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}

/// Multiply packed 16-bit integers, keep the low 16 bits of each product.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a * cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mullo_epi16(A, B);
    short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Bitwise OR of packed doubles (bit pattern only, no FP semantics).
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
}

/// Bitwise OR of 128-bit integer vectors.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    return a | b;
}

// Pack 32-bit integers to 16-bit with signed saturation.
version(LDC)
{
    alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
}
else
{
    __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @safe
    {
        short8 r;
        r[0] = saturateSignedIntToSignedShort(a[0]);
        r[1] = saturateSignedIntToSignedShort(a[1]);
        r[2] = saturateSignedIntToSignedShort(a[2]);
        r[3] = saturateSignedIntToSignedShort(a[3]);
        r[4] = saturateSignedIntToSignedShort(b[0]);
        r[5] = saturateSignedIntToSignedShort(b[1]);
        r[6] = saturateSignedIntToSignedShort(b[2]);
        r[7] = saturateSignedIntToSignedShort(b[3]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packs_epi32(A, A);
    short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
    assert(R.array == correct);
}

// Pack 16-bit integers to 8-bit with signed saturation.
version(LDC)
{
    alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
}
else
{
    __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @safe
    {
        byte16 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            r[i] = saturateSignedWordToSignedByte(sa[i]);
        foreach(i; 0..8)
            r[i+8] = saturateSignedWordToSignedByte(sb[i]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte16 R = cast(byte16) _mm_packs_epi16(A, A);
    byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
                        127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

// Pack 16-bit integers to 8-bit with unsigned saturation.
version(LDC)
{
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
else
{
    __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        ubyte[16] result = void;
        for (int i = 0; i < 8; ++i)
        {
            short s = sa[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i] = cast(ubyte)s;

            s = sb[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i+8] = cast(ubyte)s;
        }
        return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    foreach(i; 0..16)
        assert(AA[i] == cast(byte)(correctResult[i]));
}

// Spin-loop hint (PAUSE instruction).
version(LDC)
{
    alias _mm_pause = __builtin_ia32_pause;
}
else
{
    void _mm_pause() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            rep; nop; // F3 90 = pause
        }
    }
}
unittest
{
    _mm_pause();
}


// Sum of absolute differences of unsigned 8-bit integers (PSADBW).
version(LDC)
{
    alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
}
else
{
    __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @safe
    {
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        ubyte[16] t;
        foreach(i; 0..16)
        {
            int diff = cast(ubyte)(ab[i]) - cast(ubyte)(bb[i]);
            if (diff < 0) diff = -diff;
            t[i] = cast(ubyte)(diff);
        }
        // Sums of each 8-byte half land in lanes 0 and 2; lanes 1 and 3 stay zero.
        int4 r = _mm_setzero_si128();
        r[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
        r[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m128i B = _mm_set1_epi8(1);
    __m128i R = _mm_sad_epu8(A, B);
    int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values (e0 is the lowest lane).
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
    return cast(__m128i) loadUnaligned!(short8)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 B = cast(short8) A;
    foreach(i; 0..8)
        assert(B.array[i] == i);
}

/// Set packed 32-bit integers with the supplied values (e0 is the lowest lane).
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(A.array[i] == i);
}

/// Set packed 64-bit integers from two __m64 values (e0 is the lower lane).
__m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
{
    long[2] result = [e0[0], e1[0]];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

/// Set packed 64-bit integers with the supplied values (e0 is the lower lane).
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    long[2] result = [e0, e1];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64x(1234, 5678);
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

/// Set packed 8-bit integers with the supplied values (e0 is the lowest lane).
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7,
                       e8, e9, e10, e11, e12, e13, e14, e15];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Set packed doubles with the supplied values (e0 is the lower lane).
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e0, e1];
    return loadUnaligned!(double2)(result.ptr);
}

/// Broadcast double `a` to both elements.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    double[2] result = [a, a];
    return loadUnaligned!(double2)(result.ptr);
}

/// Set the lower double to `a`, zero the upper element.
__m128d _mm_set_sd (double a) pure @trusted
{
    double[2] result = [a, 0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Broadcast 16-bit integer `a` to all elements.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    return cast(__m128i)(short8(a));
}

/// Broadcast 32-bit integer `a` to all elements.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    return cast(__m128i)(int4(a));
}
unittest
{
    __m128 a = _mm_set1_ps(-1.0f);
    __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff);
    assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]);
}

/// Broadcast 64-bit integer `a` to all elements of `dst`.
__m128i _mm_set1_epi64 (__m64 a) pure @safe
{
    return _mm_set_epi64(a, a);
}

/// Broadcast 64-bit integer `a` to all elements.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    return cast(__m128i)(long2(a));
}

/// Broadcast 8-bit integer `a` to all elements.
__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    return cast(__m128i)(byte16(a));
}

alias _mm_set1_pd = _mm_set_pd1;

/// Set packed 16-bit integers in order (e7 becomes the lowest lane).
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}

/// Set packed 32-bit integers in order (e3 becomes the lowest lane).
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}

/// Set packed 64-bit integers in order (e1 becomes the lower lane).
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] result = [e1, e0];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}

/// Set packed 8-bit integers in order (e15 becomes the lowest lane).
__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9, byte e8,
                       byte e7, byte e6, byte e5, byte e4,
                       byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
                       e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Set packed doubles in order (e1 becomes the lower lane).
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e1, e0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Return a vector of all-zero doubles.
__m128d _mm_setzero_pd () pure @trusted
{
    double[2] result = [0.0, 0.0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Return a 128-bit integer vector of zeroes.
__m128i _mm_setzero_si128() pure @trusted
{
    int[4] result = [0, 0, 0, 0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}

/// Shuffle 32-bit integers of `a` using the compile-time control `imm8`.
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    return shufflevector!(int4, (imm8 >> 0) & 3,
                                (imm8 >> 2) & 3,
                                (imm8 >> 4) & 3,
                                (imm8 >> 6) & 3)(a, a);
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

/// Shuffle doubles: lower lane picked from `a`, upper lane from `b`, per `imm8`.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                   2 + ( (imm8 >> 1) & 1 ))(a, b);
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}

/// Shuffle the upper four 16-bit integers of `a` per `imm8`; lower half passes through.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                        4 + ( (imm8 >> 0) & 3 ),
                                        4 + ( (imm8 >> 2) & 3 ),
                                        4 + ( (imm8 >> 4) & 3 ),
                                        4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}

/// Shuffle the lower four 16-bit integers of `a` per `imm8`; upper half passes through.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                ( (imm8 >> 2) & 3 ),
                                                ( (imm8 >> 4) & 3 ),
                                                ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}

// Shift 32-bit integers left by the count in the low 64 bits of `count`.
version(LDC)
{
    alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else
{
    __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        // NOTE(review): unlike hardware PSLLD, counts >= 32 are not clamped to
        // produce zero here (D shifts by >= the bit width are not defined) —
        // verify callers only pass counts in range. Same caveat applies to the
        // other non-LDC shift fallbacks below.
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << bits;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_sll_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}

// Shift 64-bit integers left by the count in the low 64 bits of `count`.
version(LDC)
{
    alias _mm_sll_epi64 = __builtin_ia32_psllq128;
}
else
{
    __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) << bits;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_sll_epi64(A, _mm_cvtsi32_si128(1));
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}

// Shift 16-bit integers left by the count in the low 64 bits of `count`.
version(LDC)
{
    alias _mm_sll_epi16 = __builtin_ia32_psllw128;
}
else
{
    __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) << bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_sll_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}

// Shift 32-bit integers left by immediate `imm8`.
version(LDC)
{
    alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
}
else
{
    __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe
    {
        int4 r = void;
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << imm8;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}

// Shift 64-bit integers left by immediate `imm8`.
version(LDC)
{
    alias _mm_slli_epi64 = __builtin_ia32_psllqi128;
}
else
{
    __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        foreach(i; 0..2)
            r[i] = cast(ulong)(sa[i]) << imm8;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_slli_epi64(A, 1);
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}

// Shift 16-bit integers left by immediate `imm8`.
version(LDC)
{
    alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
}
else
{
    __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) << imm8);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}

/// Shift `a` left by `imm8` bytes while shifting in zeros.
__m128i _mm_slli_si128(ubyte imm8)(__m128i op) pure @safe
{
    static if (imm8 & 0xF0) // shifts of 16 bytes or more zero the whole vector
        return _mm_setzero_si128();
    else
        return cast(__m128i) shufflevector!(byte16,
        16 - imm8, 17 - imm8, 18 - imm8, 19 - imm8, 20 - imm8, 21 - imm8, 22 - imm8, 23 - imm8,
        24 - imm8, 25 - imm8, 26 - imm8, 27 - imm8, 28 - imm8, 29 - imm8, 30 - imm8, 31 - imm8)
        (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
    short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(R.array == correct);
}

// Square root of packed doubles.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    else
    {
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
}
else
{
    __m128d _mm_sqrt_pd(__m128d vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = sqrt(vec.array[1]);
        return vec;
    }
}


// Square root of the lower double; the upper double is passed through.
version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    else
    {
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            return vec;
        }
    }
}
else
{
    __m128d _mm_sqrt_sd(__m128d vec) pure @safe
    {
        vec.array[0] = sqrt(vec.array[0]);
        vec.array[1] = vec.array[1];
        return vec;
    }
}


// Arithmetic right shift of 16-bit integers by the count in the low 64 bits of `count`.
version(LDC)
{
    alias _mm_sra_epi16 = __builtin_ia32_psraw128;
}
else
{
    __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(sa[i] >> bits);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_sra_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
}

// Arithmetic right shift of 32-bit integers by the count in the low 64 bits of `count`.
version(LDC)
{
    alias _mm_sra_epi32 = __builtin_ia32_psrad128;
}
else
{
    __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc[0]);
        foreach(i; 0..4)
            r[i] = (a[i] >> bits);
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_sra_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
}


// Arithmetic right shift of 16-bit integers by immediate `imm8`.
version(LDC)
{
    alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
}
else
{
    __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 r = void;
        foreach(i; 0..8)
            r[i] = cast(short)(sa[i] >> imm8);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
}

// Arithmetic right shift of 32-bit integers by immediate `imm8`.
version(LDC)
{
    alias _mm_srai_epi32 = __builtin_ia32_psradi128;
}
else
{
    __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe
    {
        int4 r = void;
        foreach(i; 0..4)
            r[i] = (a[i] >> imm8);
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srai_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
}
else
{
    /// Shift packed 16-bit integers in `a` right by the count in the low
    /// 64 bits of `count`, shifting in zeros.
    __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc[0]);
        short8 r = void;
        // Intel semantics: a count above 15 zeroes every lane. Without this
        // guard, large counts would be an undefined shift in D.
        if (bits > 15)
        {
            r = 0;
        }
        else
        {
            foreach(i; 0..8)
                r[i] = cast(short)(cast(ushort)(sa[i]) >> cast(int)bits);
        }
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srl_epi16(A, _mm_cvtsi32_si128(1)) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srl_epi32 = __builtin_ia32_psrld128;
}
else
{
    /// Shift packed 32-bit integers in `a` right by the count in the low
    /// 64 bits of `count`, shifting in zeros.
    __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc[0]);
        // Intel semantics: a count above 31 zeroes every lane.
        if (bits > 31)
        {
            r = 0;
        }
        else
        {
            foreach(i; 0..4)
                r[i] = cast(uint)(a[i]) >> cast(int)bits;
        }
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srl_epi32(A, _mm_cvtsi32_si128(1));
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srl_epi64 = __builtin_ia32_psrlq128;
}
else
{
    /// Shift packed 64-bit integers in `a` right by the count in the low
    /// 64 bits of `count`, shifting in zeros.
    __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        ulong bits = cast(ulong)(lc[0]);
        // Intel semantics: a count above 63 zeroes every lane.
        if (bits > 63)
        {
            r = 0;
        }
        else
        {
            foreach(i; 0..2)
                r[i] = cast(ulong)(sa[i]) >> cast(int)bits;
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srl_epi64(A, _mm_cvtsi32_si128(1));
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
}
else
{
    /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 r = void;
        // Intel semantics: a count above 15 zeroes every lane. Clamp to 16:
        // the promoted-to-int shift stays defined and yields 0 for a ushort.
        int count = (cast(uint)imm8 > 16) ? 16 : imm8;
        foreach(i; 0..8)
            r[i] = cast(short)(cast(ushort)(sa[i]) >> count);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srli_epi32 = __builtin_ia32_psrldi128;
}
else
{
    /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @safe
    {
        int4 r = void;
        // Intel semantics: a count above 31 zeroes every lane.
        if (cast(uint)imm8 > 31)
        {
            r = 0;
        }
        else
        {
            foreach(i; 0..4)
                r[i] = cast(uint)(a[i]) >> imm8;
        }
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srli_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
}

version(LDC)
{
    alias _mm_srli_epi64 = __builtin_ia32_psrlqi128;
}
else
{
    /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
    __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        // Intel semantics: a count above 63 zeroes every lane.
        if (cast(uint)imm8 > 63)
        {
            r = 0;
        }
        else
        {
            foreach(i; 0..2)
                r[i] = cast(ulong)(sa[i]) >> imm8;
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srli_epi64(A, 1);
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    static if (bytes & 0xF0)
    {
        // Shifting by 16 bytes or more clears the whole register.
        return _mm_setzero_si128();
    }
    else
    {
        // Concatenate `v` with zeros and pick the 16 bytes starting at `bytes`.
        return cast(__m128i) shufflevector!(byte16,
            bytes+0, bytes+1, bytes+2,  bytes+3,  bytes+4,  bytes+5,  bytes+6,  bytes+7,
            bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
            (cast(byte16) v, cast(byte16)_mm_setzero_si128());
    }
}
unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] correct = [2, 3, 4, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    // Byte-wise shift of the raw bits, reinterpreted as floats.
    return cast(__m128) _mm_srli_si128!bytes(cast(__m128i)v);
}
unittest
{
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    // Byte-wise shift of the raw bits, reinterpreted as doubles.
    return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
}

/// Store both lanes of `a` to memory. Per the Intel spec, `mem_addr`
/// should be 16-byte aligned.
void _mm_store_pd (double* mem_addr, __m128d a) pure
{
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a;
}

/// Store the lower lane of `a` into both slots of the destination.
/// Per the Intel spec, `mem_addr` should be 16-byte aligned.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure
{
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = shufflevector!(double2, 0, 0)(a, a);
}

/// Store the lower double-precision lane of `a` to memory.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a[0];
}

/// Store 128 bits of integer data. Per the Intel spec, `mem_addr`
/// should be 16-byte aligned.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1;

/// Store the upper double-precision lane of `a` to memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a[1];
}

// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
/// Store the low 64 bits of integer vector `a` to memory.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long* dest = cast(long*)mem_addr;
    long2 lo = cast(long2)a;
    *dest = lo[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store the lower double-precision lane of `a` to memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a[0];
}

/// Store the two lanes of `a` to memory in reverse order.
/// Per the Intel spec, `mem_addr` should be 16-byte aligned.
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = shufflevector!(double2, 1, 0)(a, a);
}

/// Store both lanes of `a` to memory; no alignment requirement.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!double2(a, mem_addr);
}

/// Store 128 bits of integer data to memory; no alignment requirement.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
void _mm_stream_pd (double* mem_addr, __m128d a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a;
}

/// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    __m128i* dest = cast(__m128i*)mem_addr;
    *dest = a;
}

/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address mem_addr is already in the cache,
/// the cache will be updated.
void _mm_stream_si32 (int* mem_addr, int a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Store 64-bit integer a into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address mem_addr is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

/// Subtract packed double-precision lanes of `b` from those of `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    return a - b;
}

version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
}
else
{
    /// Subtract the lower lane of `b` from the lower lane of `a`;
    /// the upper lane of `a` passes through.
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        a[0] -= b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}

/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    return a - b;
}

version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBSW since LDC 1.15 -O0
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
}
else
{
    /// Subtract packed 16-bit integers in `b` from those in `a`, saturating.
    __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
    {
        short[8] clamped;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            clamped[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)clamped.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBSB since LDC 1.15 -O0
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
}
else
{
    /// Subtract packed 8-bit integers in `b` from those in `a`, saturating.
    __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte[16] clamped;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            clamped[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)clamped.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBUSW since LDC 1.15 -O0
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
}
else
{
    /// Subtract packed unsigned 16-bit integers in `b` from those in `a`,
    /// saturating at zero.
    __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
    {
        short[8] clamped;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            int diff = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            clamped[i] = saturateSignedIntToUnsignedShort(diff);
        }
        return _mm_loadu_si128(cast(int4*)clamped.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(cast(short)65534, 0, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534, 1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}

version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBUSB since LDC 1.15 -O0
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
}
else
{
    /// Subtract packed unsigned 8-bit integers in `b` from those in `a`,
    /// saturating at zero.
    __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
    {
        ubyte[16] clamped;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            clamped[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)clamped.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Note: the only difference between these intrinsics is the signalling
// behaviour of quiet NaNs. This is incorrect but the case where
// you would want to differentiate between qNaN and sNaN and then
// treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_sd  = _mm_comieq_sd;
alias _mm_ucomige_sd  = _mm_comige_sd;
alias _mm_ucomigt_sd  = _mm_comigt_sd;
alias _mm_ucomile_sd  = _mm_comile_sd;
alias _mm_ucomilt_sd  = _mm_comilt_sd;
alias _mm_ucomineq_sd = _mm_comineq_sd;

/// Return a vector with indeterminate contents.
__m128d _mm_undefined_pd() pure @safe
{
    __m128d r = void;
    return r;
}

/// Return a vector with indeterminate contents.
__m128i _mm_undefined_si128() pure @safe
{
    __m128i r = void;
    return r;
}

/// Interleave 16-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                       (cast(short8)a, cast(short8)b);
}

/// Interleave 32-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
}

/// Interleave 64-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(long2, 1, 3)(cast(long2)a, cast(long2)b);
}

/// Interleave 8-bit integers from the high halves of `a` and `b`.
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(byte16, 8,  24, 9,  25, 10, 26, 11, 27,
                                                12, 28, 13, 29, 14, 30, 15, 31)
                                       (cast(byte16)a, cast(byte16)b);
}

/// Interleave the upper double-precision lanes of `a` and `b`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    return shufflevector!(__m128d, 1, 3)(a, b);
}

/// Interleave 16-bit integers from the low halves of `a` and `b`.
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                       (cast(short8)a, cast(short8)b);
}

/// Interleave 32-bit integers from the low halves of `a` and `b`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b);
}

/// Interleave 64-bit integers from the low halves of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(long2, 0, 2)(cast(long2)a, cast(long2)b);
}

/// Interleave 8-bit integers from the low halves of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                                4, 20, 5, 21, 6, 22, 7, 23)
                                       (cast(byte16)a, cast(byte16)b);
}

/// Interleave the lower double-precision lanes of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    return shufflevector!(__m128d, 0, 2)(a, b);
}

/// Bitwise XOR of `a` and `b`, reinterpreted as doubles.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
}

/// Bitwise XOR of `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}

unittest
{
    // distance between two points in 4D
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}