1 /** 2 * SSE2 intrinsics. 3 * 4 * Copyright: Copyright Auburn Sounds 2016-2019, Stefanos Baziotis 2019. 5 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 6 * Authors: Guillaume Piolat 7 */ 8 module inteli.emmintrin; 9 10 public import inteli.types; 11 public import inteli.xmmintrin; // SSE2 includes SSE1 12 import inteli.mmx; 13 import inteli.internals; 14 15 nothrow @nogc: 16 17 18 // SSE2 instructions 19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2 20 21 /// Add packed 16-bit integers in `a` and `b`. 22 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe 23 { 24 return cast(__m128i)(cast(short8)a + cast(short8)b); 25 } 26 unittest 27 { 28 __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77); 29 short8 R = cast(short8) _mm_add_epi16(A, A); 30 short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154]; 31 assert(R.array == correct); 32 } 33 34 /// Add packed 32-bit integers in `a` and `b`. 35 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe 36 { 37 return cast(__m128i)(cast(int4)a + cast(int4)b); 38 } 39 unittest 40 { 41 __m128i A = _mm_setr_epi32( -7, -1, 0, 9); 42 int4 R = _mm_add_epi32(A, A); 43 int[4] correct = [ -14, -2, 0, 18 ]; 44 assert(R.array == correct); 45 } 46 47 /// Add packed 64-bit integers in `a` and `b`. 48 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe 49 { 50 return cast(__m128i)(cast(long2)a + cast(long2)b); 51 } 52 unittest 53 { 54 __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000); 55 long2 R = cast(long2) _mm_add_epi64(A, A); 56 long[2] correct = [ -2, 0 ]; 57 assert(R.array == correct); 58 } 59 60 /// Add packed 8-bit integers in `a` and `b`. 61 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe 62 { 63 return cast(__m128i)(cast(byte16)a + cast(byte16)b); 64 } 65 unittest 66 { 67 __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78); 68 byte16 R = cast(byte16) _mm_add_epi8(A, A); 69 byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100]; 70 assert(R.array == correct); 71 } 72 73 /// Add the lower double-precision (64-bit) floating-point element 74 /// in `a` and `b`, store the result in the lower element of dst, 75 /// and copy the upper element from `a` to the upper element of destination. 76 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe 77 { 78 static if (GDC_with_SSE2) 79 { 80 return __builtin_ia32_addsd(a, b); 81 } 82 else version(DigitalMars) 83 { 84 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 85 asm pure nothrow @nogc @trusted { nop;} 86 a[0] = a[0] + b[0]; 87 return a; 88 } 89 else 90 { 91 a[0] += b[0]; 92 return a; 93 } 94 } 95 unittest 96 { 97 __m128d a = [1.5, -2.0]; 98 a = _mm_add_sd(a, a); 99 assert(a.array == [3.0, -2.0]); 100 } 101 102 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`. 103 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe 104 { 105 return a + b; 106 } 107 unittest 108 { 109 __m128d a = [1.5, -2.0]; 110 a = _mm_add_pd(a, a); 111 assert(a.array == [3.0, -4.0]); 112 } 113 114 /// Add 64-bit integers `a` and `b`. 115 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe 116 { 117 return a + b; 118 } 119 120 /// Add packed 16-bit integers in `a` and `b` using signed saturation. 121 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted 122 { 123 static if (GDC_with_SSE2) 124 { 125 return __builtin_ia32_paddsw128(a, b); 126 } 127 else version(LDC) 128 { 129 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 130 { 131 // x86: Generates PADDSW since LDC 1.15 -O0 132 // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20 133 enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 134 enum ir = ` 135 %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 136 ret <8 x i16> %r`; 137 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 138 } 139 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 140 { 141 short[8] res; 142 short8 sa = cast(short8)a; 143 short8 sb = cast(short8)b; 144 foreach(i; 0..8) 145 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 146 return _mm_loadu_si128(cast(int4*)res.ptr); 147 } 148 else 149 return __builtin_ia32_paddsw128(a, b); 150 } 151 else 152 { 153 short[8] res; 154 short8 sa = cast(short8)a; 155 short8 sb = cast(short8)b; 156 foreach(i; 0..8) 157 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 158 return _mm_loadu_si128(cast(int4*)res.ptr); 159 } 160 } 161 unittest 162 { 163 short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), 164 _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0)); 165 static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14]; 166 assert(res.array == correctResult); 167 } 168 169 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation. 170 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted 171 { 172 static if (GDC_with_SSE2) 173 { 174 return __builtin_ia32_paddsb128(a, b); 175 } 176 else version(LDC) 177 { 178 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 179 { 180 // x86: Generates PADDSB since LDC 1.15 -O0 181 // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20 182 enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 183 enum ir = ` 184 %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 185 ret <16 x i8> %r`; 186 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 187 } 188 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 189 { 190 byte[16] res; 191 byte16 sa = cast(byte16)a; 192 byte16 sb = cast(byte16)b; 193 foreach(i; 0..16) 194 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]); 195 return _mm_loadu_si128(cast(int4*)res.ptr); 196 } 197 else 198 return __builtin_ia32_paddsb128(a, b); 199 } 200 else 201 { 202 byte[16] res; 203 byte16 sa = cast(byte16)a; 204 byte16 sb = cast(byte16)b; 205 foreach(i; 0..16) 206 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]); 207 return _mm_loadu_si128(cast(int4*)res.ptr); 208 } 209 } 210 unittest 211 { 212 byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 213 _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 214 static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14, 215 16, 18, 20, 22, 24, 26, 28, 30]; 216 assert(res.array == correctResult); 217 } 218 219 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 220 // PERF: #GDC version? 221 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted 222 { 223 version(LDC) 224 { 225 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 226 { 227 // x86: Generates PADDUSB since LDC 1.15 -O0 228 // ARM: Generates uqadd.16b since LDC 1.21 -O1 229 enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 230 enum ir = ` 231 %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 232 ret <16 x i8> %r`; 233 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 234 } 235 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 236 { 237 ubyte[16] res; 238 byte16 sa = cast(byte16)a; 239 byte16 sb = cast(byte16)b; 240 foreach(i; 0..16) 241 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 242 return _mm_loadu_si128(cast(int4*)res.ptr); 243 } 244 else 245 return __builtin_ia32_paddusb128(a, b); 246 } 247 else 248 { 249 ubyte[16] res; 250 byte16 sa = cast(byte16)a; 251 byte16 sb = cast(byte16)b; 252 foreach(i; 0..16) 253 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 254 return _mm_loadu_si128(cast(int4*)res.ptr); 255 } 256 } 257 unittest 258 { 259 byte16 res = cast(byte16) _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0), 260 _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0)); 261 static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 0, cast(byte)255, 4, 6, 8, 10, 12, 14]; 262 assert(res.array == correctResult); 263 } 264 265 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation. 266 // PERF: #GDC version? 267 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted 268 { 269 version(LDC) 270 { 271 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 272 { 273 // x86: Generates PADDUSW since LDC 1.15 -O0 274 // ARM: Generates uqadd.8h since LDC 1.21 -O1 275 enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 276 enum ir = ` 277 %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 278 ret <8 x i16> %r`; 279 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 280 } 281 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 282 { 283 ushort[8] res; 284 short8 sa = cast(short8)a; 285 short8 sb = cast(short8)b; 286 foreach(i; 0..8) 287 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 288 return _mm_loadu_si128(cast(int4*)res.ptr); 289 } 290 else 291 return __builtin_ia32_paddusw128(a, b); 292 } 293 else 294 { 295 ushort[8] res; 296 short8 sa = cast(short8)a; 297 short8 sb = cast(short8)b; 298 foreach(i; 0..8) 299 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 300 return _mm_loadu_si128(cast(int4*)res.ptr); 301 } 302 } 303 unittest 304 { 305 short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0), 306 _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0)); 307 static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6]; 308 assert(res.array == correctResult); 309 } 310 311 /// Compute the bitwise AND of packed double-precision (64-bit) 312 /// floating-point elements in `a` and `b`. 313 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe 314 { 315 return cast(__m128d)( cast(long2)a & cast(long2)b ); 316 } 317 unittest 318 { 319 double a = 4.32; 320 double b = -78.99; 321 long correct = (*cast(long*)(&a)) & (*cast(long*)(&b)); 322 __m128d A = _mm_set_pd(a, b); 323 __m128d B = _mm_set_pd(b, a); 324 long2 R = cast(long2)( _mm_and_pd(A, B) ); 325 assert(R.array[0] == correct); 326 assert(R.array[1] == correct); 327 } 328 329 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`. 330 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe 331 { 332 return a & b; 333 } 334 unittest 335 { 336 __m128i A = _mm_set1_epi32(7); 337 __m128i B = _mm_set1_epi32(14); 338 __m128i R = _mm_and_si128(A, B); 339 int[4] correct = [6, 6, 6, 6]; 340 assert(R.array == correct); 341 } 342 343 /// Compute the bitwise NOT of packed double-precision (64-bit) 344 /// floating-point elements in `a` and then AND with `b`. 345 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe 346 { 347 return cast(__m128d)( ~(cast(long2)a) & cast(long2)b); 348 } 349 unittest 350 { 351 double a = 4.32; 352 double b = -78.99; 353 long correct = (~*cast(long*)(&a)) & ( *cast(long*)(&b)); 354 long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b)); 355 __m128d A = _mm_setr_pd(a, b); 356 __m128d B = _mm_setr_pd(b, a); 357 long2 R = cast(long2)( _mm_andnot_pd(A, B) ); 358 assert(R.array[0] == correct); 359 assert(R.array[1] == correct2); 360 } 361 362 /// Compute the bitwise NOT of 128 bits (representing integer data) 363 /// in `a` and then AND with `b`. 364 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe 365 { 366 return (~a) & b; 367 } 368 unittest 369 { 370 __m128i A = _mm_set1_epi32(7); 371 __m128i B = _mm_set1_epi32(14); 372 __m128i R = _mm_andnot_si128(A, B); 373 int[4] correct = [8, 8, 8, 8]; 374 assert(R.array == correct); 375 } 376 377 /// Average packed unsigned 16-bit integers in `a` and `b`. 378 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted 379 { 380 static if (GDC_with_SSE2) 381 { 382 return __builtin_ia32_pavgw128(a, b); 383 } 384 else static if (LDC_with_ARM64) 385 { 386 return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b); 387 } 388 else version(LDC) 389 { 390 // Generates pavgw even in LDC 1.0, even in -O0 391 // But not in ARM 392 enum ir = ` 393 %ia = zext <8 x i16> %0 to <8 x i32> 394 %ib = zext <8 x i16> %1 to <8 x i32> 395 %isum = add <8 x i32> %ia, %ib 396 %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 397 %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 398 %r = trunc <8 x i32> %isums to <8 x i16> 399 ret <8 x i16> %r`; 400 return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b); 401 } 402 else 403 { 404 short8 sa = cast(short8)a; 405 short8 sb = cast(short8)b; 406 short8 sr = void; 407 foreach(i; 0..8) 408 { 409 sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 ); 410 } 411 return cast(int4)sr; 412 } 413 } 414 unittest 415 { 416 __m128i A = _mm_set1_epi16(31); 417 __m128i B = _mm_set1_epi16(64); 418 short8 avg = cast(short8)(_mm_avg_epu16(A, B)); 419 foreach(i; 0..8) 420 assert(avg.array[i] == 48); 421 } 422 423 /// Average packed unsigned 8-bit integers in `a` and `b`. 424 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted 425 { 426 static if (GDC_with_SSE2) 427 { 428 return __builtin_ia32_pavgb128(a, b); 429 } 430 else static if (LDC_with_ARM64) 431 { 432 return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b); 433 } 434 else version(LDC) 435 { 436 // Generates pavgb even in LDC 1.0, even in -O0 437 // But not in ARM 438 enum ir = ` 439 %ia = zext <16 x i8> %0 to <16 x i16> 440 %ib = zext <16 x i8> %1 to <16 x i16> 441 %isum = add <16 x i16> %ia, %ib 442 %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 443 %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 444 %r = trunc <16 x i16> %isums to <16 x i8> 445 ret <16 x i8> %r`; 446 return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 447 } 448 else 449 { 450 byte16 sa = cast(byte16)a; 451 byte16 sb = cast(byte16)b; 452 byte16 sr = void; 453 foreach(i; 0..16) 454 { 455 sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 ); 456 } 457 return cast(int4)sr; 458 } 459 } 460 unittest 461 { 462 __m128i A = _mm_set1_epi8(31); 463 __m128i B = _mm_set1_epi8(64); 464 byte16 avg = cast(byte16)(_mm_avg_epu8(A, B)); 465 foreach(i; 0..16) 466 assert(avg.array[i] == 48); 467 } 468 469 /// Shift `a` left by `bytes` bytes while shifting in zeros. 470 alias _mm_bslli_si128 = _mm_slli_si128; 471 unittest 472 { 473 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 474 byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; 475 __m128i result = _mm_bslli_si128!5(toShift); 476 assert( (cast(byte16)result).array == exact); 477 } 478 479 /// Shift `v` right by `bytes` bytes while shifting in zeros. 480 alias _mm_bsrli_si128 = _mm_srli_si128; 481 unittest 482 { 483 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 484 byte[16] exact = [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0]; 485 __m128i result = _mm_bsrli_si128!5(toShift); 486 assert( (cast(byte16)result).array == exact); 487 } 488 489 /// Cast vector of type `__m128d` to type `__m128`. 490 /// Note: Also possible with a regular `cast(__m128)(a)`. 491 __m128 _mm_castpd_ps (__m128d a) pure @safe 492 { 493 return cast(__m128)a; 494 } 495 496 /// Cast vector of type `__m128d` to type `__m128i`. 497 /// Note: Also possible with a regular `cast(__m128i)(a)`. 498 __m128i _mm_castpd_si128 (__m128d a) pure @safe 499 { 500 return cast(__m128i)a; 501 } 502 503 /// Cast vector of type `__m128` to type `__m128d`. 504 /// Note: Also possible with a regular `cast(__m128d)(a)`. 505 __m128d _mm_castps_pd (__m128 a) pure @safe 506 { 507 return cast(__m128d)a; 508 } 509 510 /// Cast vector of type `__m128` to type `__m128i`. 511 /// Note: Also possible with a regular `cast(__m128i)(a)`. 512 __m128i _mm_castps_si128 (__m128 a) pure @safe 513 { 514 return cast(__m128i)a; 515 } 516 517 /// Cast vector of type `__m128i` to type `__m128d`. 518 /// Note: Also possible with a regular `cast(__m128d)(a)`. 519 __m128d _mm_castsi128_pd (__m128i a) pure @safe 520 { 521 return cast(__m128d)a; 522 } 523 524 /// Cast vector of type `__m128i` to type `__m128`. 525 /// Note: Also possible with a regular `cast(__m128)(a)`. 526 __m128 _mm_castsi128_ps (__m128i a) pure @safe 527 { 528 return cast(__m128)a; 529 } 530 531 /// Invalidate and flush the cache line that contains `p` 532 /// from all levels of the cache hierarchy. 533 void _mm_clflush (const(void)* p) @trusted 534 { 535 static if (GDC_with_SSE2) 536 { 537 __builtin_ia32_clflush(p); 538 } 539 else static if (LDC_with_SSE2) 540 { 541 __builtin_ia32_clflush(cast(void*)p); 542 } 543 else version(D_InlineAsm_X86) 544 { 545 asm pure nothrow @nogc @safe 546 { 547 mov EAX, p; 548 clflush [EAX]; 549 } 550 } 551 else version(D_InlineAsm_X86_64) 552 { 553 asm pure nothrow @nogc @safe 554 { 555 mov RAX, p; 556 clflush [RAX]; 557 } 558 } 559 else 560 { 561 // Do nothing. Invalidating cacheline does 562 // not affect correctness. 563 } 564 } 565 unittest 566 { 567 ubyte[64] cacheline; 568 _mm_clflush(cacheline.ptr); 569 } 570 571 /// Compare packed 16-bit integers in `a` and `b` for equality. 572 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe 573 { 574 static if (GDC_with_SSE2) 575 { 576 return __builtin_ia32_pcmpeqw128(a, b); 577 } 578 else 579 { 580 return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b); 581 } 582 } 583 unittest 584 { 585 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 586 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 587 short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0]; 588 short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B)); 589 assert(R.array == E); 590 } 591 592 /// Compare packed 32-bit integers in `a` and `b` for equality. 593 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe 594 { 595 static if (GDC_with_SSE2) 596 { 597 return __builtin_ia32_pcmpeqd128(a, b); 598 } 599 else 600 { 601 return equalMask!__m128i(a, b); 602 } 603 } 604 unittest 605 { 606 int4 A = [-3, -2, -1, 0]; 607 int4 B = [ 4, -2, 2, 0]; 608 int[4] E = [ 0, -1, 0, -1]; 609 int4 R = cast(int4)(_mm_cmpeq_epi16(A, B)); 610 assert(R.array == E); 611 } 612 613 /// Compare packed 8-bit integers in `a` and `b` for equality. 614 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe 615 { 616 static if (GDC_with_SSE2) 617 { 618 return __builtin_ia32_pcmpeqb128(a, b); 619 } 620 else 621 { 622 return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b); 623 } 624 } 625 unittest 626 { 627 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 628 __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 629 byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B); 630 byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1]; 631 assert(C.array == correct); 632 } 633 634 /// Compare packed double-precision (64-bit) floating-point elements 635 /// in `a` and `b` for equality. 636 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe 637 { 638 static if (GDC_with_SSE2) 639 { 640 return __builtin_ia32_cmpeqpd(a, b); 641 } 642 else 643 { 644 return cast(__m128d) cmppd!(FPComparison.oeq)(a, b); 645 } 646 } 647 648 /// Compare the lower double-precision (64-bit) floating-point elements 649 /// in `a` and `b` for equality, store the result in the lower element, 650 /// and copy the upper element from `a`. 651 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe 652 { 653 static if (GDC_with_SSE2) 654 { 655 return __builtin_ia32_cmpeqsd(a, b); 656 } 657 else 658 { 659 return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b); 660 } 661 } 662 663 /// Compare packed double-precision (64-bit) floating-point elements 664 /// in `a` and `b` for greater-than-or-equal. 665 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe 666 { 667 static if (GDC_with_SSE2) 668 { 669 return __builtin_ia32_cmpgepd(a, b); 670 } 671 else 672 { 673 return cast(__m128d) cmppd!(FPComparison.oge)(a, b); 674 } 675 } 676 677 /// Compare the lower double-precision (64-bit) floating-point elements 678 /// in `a` and `b` for greater-than-or-equal, store the result in the 679 /// lower element, and copy the upper element from `a`. 680 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe 681 { 682 // Note: There is no __builtin_ia32_cmpgesd builtin. 683 static if (GDC_with_SSE2) 684 { 685 return __builtin_ia32_cmpnltsd(b, a); 686 } 687 else 688 { 689 return cast(__m128d) cmpsd!(FPComparison.oge)(a, b); 690 } 691 } 692 693 /// Compare packed 16-bit integers in `a` and `b` for greater-than. 694 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe 695 { 696 static if (GDC_with_SSE2) 697 { 698 return __builtin_ia32_pcmpgtw128(a, b); 699 } 700 else 701 { 702 return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b)); 703 } 704 } 705 unittest 706 { 707 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 708 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 709 short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1]; 710 short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B)); 711 assert(R.array == E); 712 } 713 714 /// Compare packed 32-bit integers in `a` and `b` for greater-than. 715 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe 716 { 717 static if (GDC_with_SSE2) 718 { 719 return __builtin_ia32_pcmpgtd128(a, b); 720 } 721 else 722 { 723 return cast(__m128i)( greaterMask!int4(a, b)); 724 } 725 } 726 unittest 727 { 728 int4 A = [-3, 2, -1, 0]; 729 int4 B = [ 4, -2, 2, 0]; 730 int[4] E = [ 0, -1, 0, 0]; 731 int4 R = cast(int4)(_mm_cmpgt_epi32(A, B)); 732 assert(R.array == E); 733 } 734 735 /// Compare packed 8-bit integers in `a` and `b` for greater-than. 736 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe 737 { 738 static if (GDC_with_SSE2) 739 { 740 return __builtin_ia32_pcmpgtb128(a, b); 741 } 742 else 743 { 744 return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b)); 745 } 746 } 747 unittest 748 { 749 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 750 __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 751 byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B); 752 byte[16] correct = [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0]; 753 __m128i D = _mm_cmpeq_epi8(A, B); 754 assert(C.array == correct); 755 } 756 757 /// Compare packed double-precision (64-bit) floating-point elements 758 /// in `a` and `b` for greater-than. 759 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe 760 { 761 static if (GDC_with_SSE2) 762 { 763 return __builtin_ia32_cmpgtpd(a, b); 764 } 765 else 766 { 767 return cast(__m128d) cmppd!(FPComparison.ogt)(a, b); 768 } 769 } 770 771 /// Compare the lower double-precision (64-bit) floating-point elements 772 /// in `a` and `b` for greater-than, store the result in the lower element, 773 /// and copy the upper element from `a`. 774 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe 775 { 776 // Note: There is no __builtin_ia32_cmpgtsd builtin. 777 static if (GDC_with_SSE2) 778 { 779 return __builtin_ia32_cmpnlesd(b, a); 780 } 781 else 782 { 783 return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b); 784 } 785 } 786 787 /// Compare packed double-precision (64-bit) floating-point elements 788 /// in `a` and `b` for less-than-or-equal. 789 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe 790 { 791 static if (GDC_with_SSE2) 792 { 793 return __builtin_ia32_cmplepd(a, b); 794 } 795 else 796 { 797 return cast(__m128d) cmppd!(FPComparison.ole)(a, b); 798 } 799 } 800 801 /// Compare the lower double-precision (64-bit) floating-point elements 802 /// in `a` and `b` for less-than-or-equal, store the result in the 803 /// lower element, and copy the upper element from `a`. 804 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe 805 { 806 static if (GDC_with_SSE2) 807 { 808 return __builtin_ia32_cmplesd(a, b); 809 } 810 else 811 { 812 return cast(__m128d) cmpsd!(FPComparison.ole)(a, b); 813 } 814 } 815 816 /// Compare packed 16-bit integers in `a` and `b` for less-than. 817 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe 818 { 819 return _mm_cmpgt_epi16(b, a); 820 } 821 822 /// Compare packed 32-bit integers in `a` and `b` for less-than. 823 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe 824 { 825 return _mm_cmpgt_epi32(b, a); 826 } 827 828 /// Compare packed 8-bit integers in `a` and `b` for less-than. 829 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe 830 { 831 return _mm_cmpgt_epi8(b, a); 832 } 833 834 /// Compare packed double-precision (64-bit) floating-point elements 835 /// in `a` and `b` for less-than. 836 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe 837 { 838 static if (GDC_with_SSE2) 839 { 840 return __builtin_ia32_cmpltpd(a, b); 841 } 842 else 843 { 844 return cast(__m128d) cmppd!(FPComparison.olt)(a, b); 845 } 846 } 847 848 /// Compare the lower double-precision (64-bit) floating-point elements 849 /// in `a` and `b` for less-than, store the result in the lower 850 /// element, and copy the upper element from `a`. 851 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe 852 { 853 static if (GDC_with_SSE2) 854 { 855 return __builtin_ia32_cmpltsd(a, b); 856 } 857 else 858 { 859 return cast(__m128d) cmpsd!(FPComparison.olt)(a, b); 860 } 861 } 862 863 /// Compare packed double-precision (64-bit) floating-point elements 864 /// in `a` and `b` for not-equal. 865 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe 866 { 867 static if (GDC_with_SSE2) 868 { 869 return __builtin_ia32_cmpneqpd(a, b); 870 } 871 else 872 { 873 return cast(__m128d) cmppd!(FPComparison.une)(a, b); 874 } 875 } 876 877 /// Compare the lower double-precision (64-bit) floating-point elements 878 /// in `a` and `b` for not-equal, store the result in the lower 879 /// element, and copy the upper element from `a`. 880 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe 881 { 882 static if (GDC_with_SSE2) 883 { 884 return __builtin_ia32_cmpneqsd(a, b); 885 } 886 else 887 { 888 return cast(__m128d) cmpsd!(FPComparison.une)(a, b); 889 } 890 } 891 892 /// Compare packed double-precision (64-bit) floating-point elements 893 /// in `a` and `b` for not-greater-than-or-equal. 894 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe 895 { 896 static if (GDC_with_SSE2) 897 { 898 return __builtin_ia32_cmpngepd(a, b); 899 } 900 else 901 { 902 return cast(__m128d) cmppd!(FPComparison.ult)(a, b); 903 } 904 } 905 906 /// Compare the lower double-precision (64-bit) floating-point elements 907 /// in `a` and `b` for not-greater-than-or-equal, store the result in 908 /// the lower element, and copy the upper element from `a`. 909 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe 910 { 911 // Note: There is no __builtin_ia32_cmpngesd builtin. 912 static if (GDC_with_SSE2) 913 { 914 return __builtin_ia32_cmpltsd(b, a); 915 } 916 else 917 { 918 return cast(__m128d) cmpsd!(FPComparison.ult)(a, b); 919 } 920 } 921 922 /// Compare packed double-precision (64-bit) floating-point elements 923 /// in `a` and `b` for not-greater-than. 924 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe 925 { 926 static if (GDC_with_SSE2) 927 { 928 return __builtin_ia32_cmpngtpd(a, b); 929 } 930 else 931 { 932 return cast(__m128d) cmppd!(FPComparison.ule)(a, b); 933 } 934 } 935 936 /// Compare the lower double-precision (64-bit) floating-point elements 937 /// in `a` and `b` for not-greater-than, store the result in the 938 /// lower element, and copy the upper element from `a`. 939 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe 940 { 941 // Note: There is no __builtin_ia32_cmpngtsd builtin. 942 static if (GDC_with_SSE2) 943 { 944 return __builtin_ia32_cmplesd(b, a); 945 } 946 else 947 { 948 return cast(__m128d) cmpsd!(FPComparison.ule)(a, b); 949 } 950 } 951 952 /// Compare packed double-precision (64-bit) floating-point elements 953 /// in `a` and `b` for not-less-than-or-equal. 954 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe 955 { 956 static if (GDC_with_SSE2) 957 { 958 return __builtin_ia32_cmpnlepd(a, b); 959 } 960 else 961 { 962 return cast(__m128d) cmppd!(FPComparison.ugt)(a, b); 963 } 964 } 965 966 /// Compare the lower double-precision (64-bit) floating-point elements 967 /// in `a` and `b` for not-less-than-or-equal, store the result in the 968 /// lower element, and copy the upper element from `a`. 969 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe 970 { 971 static if (GDC_with_SSE2) 972 { 973 return __builtin_ia32_cmpnlesd(a, b); 974 } 975 else 976 { 977 return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b); 978 } 979 } 980 981 /// Compare packed double-precision (64-bit) floating-point elements 982 /// in `a` and `b` for not-less-than. 983 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe 984 { 985 static if (GDC_with_SSE2) 986 { 987 return __builtin_ia32_cmpnltpd(a, b); 988 } 989 else 990 { 991 return cast(__m128d) cmppd!(FPComparison.uge)(a, b); 992 } 993 } 994 995 /// Compare the lower double-precision (64-bit) floating-point elements 996 /// in `a` and `b` for not-less-than, store the result in the lower 997 /// element, and copy the upper element from `a`. 998 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe 999 { 1000 static if (GDC_with_SSE2) 1001 { 1002 return __builtin_ia32_cmpnltsd(a, b); 1003 } 1004 else 1005 { 1006 return cast(__m128d) cmpsd!(FPComparison.uge)(a, b); 1007 } 1008 } 1009 1010 /// Compare packed double-precision (64-bit) floating-point elements 1011 /// in `a` and `b` to see if neither is NaN. 1012 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe 1013 { 1014 static if (GDC_with_SSE2) 1015 { 1016 return __builtin_ia32_cmpordpd(a, b); 1017 } 1018 else 1019 { 1020 return cast(__m128d) cmppd!(FPComparison.ord)(a, b); 1021 } 1022 } 1023 1024 /// Compare the lower double-precision (64-bit) floating-point elements 1025 /// in `a` and `b` to see if neither is NaN, store the result in the 1026 /// lower element, and copy the upper element from `a` to the upper element. 1027 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe 1028 { 1029 static if (GDC_with_SSE2) 1030 { 1031 return __builtin_ia32_cmpordsd(a, b); 1032 } 1033 else 1034 { 1035 return cast(__m128d) cmpsd!(FPComparison.ord)(a, b); 1036 } 1037 } 1038 1039 /// Compare packed double-precision (64-bit) floating-point elements 1040 /// in `a` and `b` to see if either is NaN. 1041 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe 1042 { 1043 static if (GDC_with_SSE2) 1044 { 1045 return __builtin_ia32_cmpunordpd(a, b); 1046 } 1047 else 1048 { 1049 return cast(__m128d) cmppd!(FPComparison.uno)(a, b); 1050 } 1051 } 1052 1053 /// Compare the lower double-precision (64-bit) floating-point elements 1054 /// in `a` and `b` to see if either is NaN, store the result in the lower 1055 /// element, and copy the upper element from `a` to the upper element. 1056 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe 1057 { 1058 static if (GDC_with_SSE2) 1059 { 1060 return __builtin_ia32_cmpunordsd(a, b); 1061 } 1062 else 1063 { 1064 return cast(__m128d) cmpsd!(FPComparison.uno)(a, b); 1065 } 1066 } 1067 1068 1069 // Note: we've reverted clang and GCC behaviour with regards to EFLAGS 1070 // Some such comparisons yields true for NaNs, other don't. 1071 1072 /// Compare the lower double-precision (64-bit) floating-point element 1073 /// in `a` and `b` for equality, and return the boolean result (0 or 1). 1074 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe 1075 { 1076 static if (GDC_with_SSE2) 1077 { 1078 return __builtin_ia32_comieq(a, b); 1079 } 1080 else 1081 { 1082 return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC 1083 } 1084 } 1085 1086 /// Compare the lower double-precision (64-bit) floating-point element 1087 /// in `a` and `b` for greater-than-or-equal, and return the boolean 1088 /// result (0 or 1). 1089 int _mm_comige_sd (__m128d a, __m128d b) pure @safe 1090 { 1091 static if (GDC_with_SSE2) 1092 { 1093 return __builtin_ia32_comige(a, b); 1094 } 1095 else 1096 { 1097 return comsd!(FPComparison.oge)(a, b); 1098 } 1099 } 1100 1101 /// Compare the lower double-precision (64-bit) floating-point element 1102 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1). 1103 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe 1104 { 1105 static if (GDC_with_SSE2) 1106 { 1107 return __builtin_ia32_comigt(a, b); 1108 } 1109 else 1110 { 1111 return comsd!(FPComparison.ogt)(a, b); 1112 } 1113 } 1114 1115 /// Compare the lower double-precision (64-bit) floating-point element 1116 /// in `a` and `b` for less-than-or-equal. 1117 int _mm_comile_sd (__m128d a, __m128d b) pure @safe 1118 { 1119 static if (GDC_with_SSE2) 1120 { 1121 return __builtin_ia32_comile(a, b); 1122 } 1123 else 1124 { 1125 return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC 1126 } 1127 } 1128 1129 /// Compare the lower double-precision (64-bit) floating-point element 1130 /// in `a` and `b` for less-than, and return the boolean result (0 or 1). 1131 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe 1132 { 1133 static if (GDC_with_SSE2) 1134 { 1135 return __builtin_ia32_comilt(a, b); 1136 } 1137 else 1138 { 1139 return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC 1140 } 1141 } 1142 1143 /// Compare the lower double-precision (64-bit) floating-point element 1144 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1). 1145 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe 1146 { 1147 static if (GDC_with_SSE2) 1148 { 1149 return __builtin_ia32_comineq(a, b); 1150 } 1151 else 1152 { 1153 return comsd!(FPComparison.one)(a, b); 1154 } 1155 } 1156 1157 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) 1158 /// floating-point elements. 1159 __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted 1160 { 1161 version(LDC) 1162 { 1163 // Generates cvtdq2pd since LDC 1.0, even without optimizations 1164 enum ir = ` 1165 %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1> 1166 %r = sitofp <2 x i32> %v to <2 x double> 1167 ret <2 x double> %r`; 1168 return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a); 1169 } 1170 else static if (GDC_with_SSE2) 1171 { 1172 return __builtin_ia32_cvtdq2pd(a); 1173 } 1174 else 1175 { 1176 double2 r = void; 1177 r.ptr[0] = a.array[0]; 1178 r.ptr[1] = a.array[1]; 1179 return r; 1180 } 1181 } 1182 unittest 1183 { 1184 __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54)); 1185 assert(A.array[0] == 54.0); 1186 assert(A.array[1] == 54.0); 1187 } 1188 1189 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 1190 /// floating-point elements. 1191 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted 1192 { 1193 static if (GDC_with_SSE2) 1194 { 1195 return __builtin_ia32_cvtdq2ps(a); 1196 } 1197 else 1198 { 1199 // x86: Generates cvtdq2ps since LDC 1.0.0 -O1 1200 // ARM: Generats scvtf.4s since LDC 1.8.0 -02 1201 __m128 res; 1202 res.ptr[0] = cast(float)a.array[0]; 1203 res.ptr[1] = cast(float)a.array[1]; 1204 res.ptr[2] = cast(float)a.array[2]; 1205 res.ptr[3] = cast(float)a.array[3]; 1206 return res; 1207 } 1208 } 1209 unittest 1210 { 1211 __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000)); 1212 assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]); 1213 } 1214 1215 /// Convert packed double-precision (64-bit) floating-point elements 1216 /// in `a` to packed 32-bit integers. 1217 // PERF #ARM 1218 __m128i _mm_cvtpd_epi32 (__m128d a) @trusted 1219 { 1220 static if (LDC_with_SSE2) 1221 { 1222 // Like in clang, implemented with a magic intrinsic right now 1223 return __builtin_ia32_cvtpd2dq(a); 1224 } 1225 else static if (GDC_with_SSE2) 1226 { 1227 return __builtin_ia32_cvtpd2dq(a); 1228 } 1229 else 1230 { 1231 __m128i r = _mm_setzero_si128(); 1232 r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]); 1233 r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]); 1234 return r; 1235 } 1236 } 1237 unittest 1238 { 1239 int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0)); 1240 assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0); 1241 } 1242 1243 /// Convert packed double-precision (64-bit) floating-point elements in `v` 1244 /// to packed 32-bit integers 1245 __m64 _mm_cvtpd_pi32 (__m128d v) @safe 1246 { 1247 return to_m64(_mm_cvtpd_epi32(v)); 1248 } 1249 unittest 1250 { 1251 int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0)); 1252 assert(A.array[0] == 55 && A.array[1] == 61); 1253 } 1254 1255 /// Convert packed double-precision (64-bit) floating-point elements 1256 /// in `a` to packed single-precision (32-bit) floating-point elements. 1257 __m128 _mm_cvtpd_ps (__m128d a) pure @trusted 1258 { 1259 static if (LDC_with_SSE2) 1260 { 1261 return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately 1262 } 1263 else static if (GDC_with_SSE2) 1264 { 1265 return __builtin_ia32_cvtpd2ps(a); 1266 } 1267 else 1268 { 1269 __m128 r = void; 1270 r.ptr[0] = a.array[0]; 1271 r.ptr[1] = a.array[1]; 1272 r.ptr[2] = 0; 1273 r.ptr[3] = 0; 1274 return r; 1275 } 1276 } 1277 unittest 1278 { 1279 __m128d A = _mm_set_pd(5.25, 4.0); 1280 __m128 B = _mm_cvtpd_ps(A); 1281 assert(B.array == [4.0f, 5.25f, 0, 0]); 1282 } 1283 1284 /// Convert packed 32-bit integers in `v` to packed double-precision 1285 /// (64-bit) floating-point elements. 1286 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe 1287 { 1288 return _mm_cvtepi32_pd(to_m128i(v)); 1289 } 1290 unittest 1291 { 1292 __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5)); 1293 assert(A.array[0] == 4.0 && A.array[1] == -5.0); 1294 } 1295 1296 /// Convert packed single-precision (32-bit) floating-point elements 1297 /// in `a` to packed 32-bit integers 1298 __m128i _mm_cvtps_epi32 (__m128 a) @trusted 1299 { 1300 static if (LDC_with_SSE2) 1301 { 1302 // Disabled, since it fail with optimizations unfortunately 1303 //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq; 1304 return __asm!__m128i("cvtps2dq $1,$0","=x,x",a); 1305 } 1306 else static if (GDC_with_SSE2) 1307 { 1308 return __builtin_ia32_cvtps2dq(a); 1309 } 1310 else static if (LDC_with_ARM64) 1311 { 1312 // Get current rounding mode. 1313 uint fpscr = arm_get_fpcr(); 1314 switch(fpscr & _MM_ROUND_MASK_ARM) 1315 { 1316 default: 1317 case _MM_ROUND_NEAREST_ARM: return vcvtnq_s32_f32(a); 1318 case _MM_ROUND_DOWN_ARM: return vcvtmq_s32_f32(a); 1319 case _MM_ROUND_UP_ARM: return vcvtpq_s32_f32(a); 1320 case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a); 1321 } 1322 } 1323 else 1324 { 1325 __m128i r = void; 1326 r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]); 1327 r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]); 1328 r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]); 1329 r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]); 1330 return r; 1331 } 1332 } 1333 unittest 1334 { 1335 uint savedRounding = _MM_GET_ROUNDING_MODE(); 1336 1337 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 1338 __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 1339 assert(A.array == [1, -2, 54, -3]); 1340 1341 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 1342 A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 1343 assert(A.array == [1, -3, 53, -3]); 1344 1345 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 1346 A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 1347 assert(A.array == [2, -2, 54, -2]); 1348 1349 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 1350 A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 1351 assert(A.array == [1, -2, 53, -2]); 1352 1353 _MM_SET_ROUNDING_MODE(savedRounding); 1354 } 1355 1356 /// Convert packed single-precision (32-bit) floating-point elements 1357 /// in `a` to packed double-precision (64-bit) floating-point elements. 1358 __m128d _mm_cvtps_pd (__m128 a) pure @trusted 1359 { 1360 version(LDC) 1361 { 1362 // Generates cvtps2pd since LDC 1.0 -O0 1363 enum ir = ` 1364 %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1> 1365 %r = fpext <2 x float> %v to <2 x double> 1366 ret <2 x double> %r`; 1367 return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a); 1368 } 1369 else static if (GDC_with_SSE2) 1370 { 1371 return __builtin_ia32_cvtps2pd(a); 1372 } 1373 else 1374 { 1375 double2 r = void; 1376 r.ptr[0] = a.array[0]; 1377 r.ptr[1] = a.array[1]; 1378 return r; 1379 } 1380 } 1381 unittest 1382 { 1383 __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f)); 1384 assert(A.array[0] == 54.0); 1385 assert(A.array[1] == 54.0); 1386 } 1387 1388 /// Copy the lower double-precision (64-bit) floating-point element of `a`. 1389 double _mm_cvtsd_f64 (__m128d a) pure @safe 1390 { 1391 return a.array[0]; 1392 } 1393 1394 /// Convert the lower double-precision (64-bit) floating-point element 1395 /// in `a` to a 32-bit integer. 1396 int _mm_cvtsd_si32 (__m128d a) @safe 1397 { 1398 static if (LDC_with_SSE2) 1399 { 1400 return __builtin_ia32_cvtsd2si(a); 1401 } 1402 else static if (GDC_with_SSE2) 1403 { 1404 return __builtin_ia32_cvtsd2si(a); 1405 } 1406 else 1407 { 1408 return convertDoubleToInt32UsingMXCSR(a[0]); 1409 } 1410 } 1411 unittest 1412 { 1413 assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0))); 1414 } 1415 1416 version(LDC) 1417 { 1418 // Unfortunately this builtin crashes in 32-bit 1419 version(X86_64) 1420 alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64; 1421 else 1422 { 1423 long _mm_cvtsd_si64 (__m128d a) @safe 1424 { 1425 return convertDoubleToInt64UsingMXCSR(a[0]); 1426 } 1427 } 1428 } 1429 else 1430 { 1431 long _mm_cvtsd_si64 (__m128d a) @safe 1432 { 1433 return convertDoubleToInt64UsingMXCSR(a.array[0]); 1434 } 1435 } 1436 unittest 1437 { 1438 assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0))); 1439 1440 uint savedRounding = _MM_GET_ROUNDING_MODE(); 1441 1442 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 1443 assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49))); 1444 1445 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 1446 assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1))); 1447 1448 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 1449 assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1))); 1450 1451 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 1452 assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9))); 1453 1454 _MM_SET_ROUNDING_MODE(savedRounding); 1455 } 1456 1457 alias _mm_cvtsd_si64x = _mm_cvtsd_si64; 1458 1459 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @safe 1460 { 1461 static if (GDC_with_SSE2) 1462 { 1463 return __builtin_ia32_cvtsd2ss(a, b); 1464 } 1465 else 1466 { 1467 // Generates cvtsd2ss since LDC 1.3 -O0 1468 a[0] = b[0]; 1469 return a; 1470 } 1471 } 1472 unittest 1473 { 1474 __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0)); 1475 assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]); 1476 } 1477 1478 int _mm_cvtsi128_si32 (__m128i a) pure @safe 1479 { 1480 return a.array[0]; 1481 } 1482 1483 long _mm_cvtsi128_si64 (__m128i a) pure @safe 1484 { 1485 long2 la = cast(long2)a; 1486 return la.array[0]; 1487 } 1488 alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64; 1489 1490 __m128d _mm_cvtsi32_sd(__m128d v, int x) pure @trusted 1491 { 1492 v.ptr[0] = cast(double)x; 1493 return v; 1494 } 1495 unittest 1496 { 1497 __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42); 1498 assert(a.array == [42.0, 0]); 1499 } 1500 1501 __m128i _mm_cvtsi32_si128 (int a) pure @trusted 1502 { 1503 int4 r = [0, 0, 0, 0]; 1504 r.ptr[0] = a; 1505 return r; 1506 } 1507 unittest 1508 { 1509 __m128i a = _mm_cvtsi32_si128(65); 1510 assert(a.array == [65, 0, 0, 0]); 1511 } 1512 1513 1514 // Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy 1515 __m128d _mm_cvtsi64_sd(__m128d v, long x) pure @trusted 1516 { 1517 v.ptr[0] = cast(double)x; 1518 return v; 1519 } 1520 unittest 1521 { 1522 __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42); 1523 assert(a.array == [42.0, 0]); 1524 } 1525 1526 __m128i _mm_cvtsi64_si128 (long a) pure @trusted 1527 { 1528 long2 r = [0, 0]; 1529 r.ptr[0] = a; 1530 return cast(__m128i)(r); 1531 } 1532 1533 alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; 1534 alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; 1535 1536 double2 _mm_cvtss_sd(double2 v, float4 x) pure @trusted 1537 { 1538 v.ptr[0] = x.array[0]; 1539 return v; 1540 } 1541 unittest 1542 { 1543 __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f)); 1544 assert(a.array == [42.0, 0]); 1545 } 1546 1547 long _mm_cvttss_si64 (__m128 a) pure @safe 1548 { 1549 return cast(long)(a.array[0]); // Generates cvttss2si as expected 1550 } 1551 unittest 1552 { 1553 assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); 1554 } 1555 1556 static if (LDC_with_SSE2) 1557 { 1558 alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq; 1559 } 1560 else static if (GDC_with_SSE2) 1561 { 1562 alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq; 1563 } 1564 else 1565 { 1566 __m128i _mm_cvttpd_epi32 (__m128d a) pure @safe 1567 { 1568 // Note: doesn't generate cvttpd2dq as of LDC 1.13 1569 __m128i r; 1570 r.array[0] = cast(int)a.array[0]; 1571 r.array[1] = cast(int)a.array[1]; 1572 r.array[2] = 0; 1573 r.array[3] = 0; 1574 return r; 1575 } 1576 } 1577 unittest 1578 { 1579 __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f)); 1580 assert(R.array == [-4, 45641, 0, 0]); 1581 } 1582 1583 1584 /// Convert packed double-precision (64-bit) floating-point elements in `v` 1585 /// to packed 32-bit integers with truncation. 1586 __m64 _mm_cvttpd_pi32 (__m128d v) pure @safe 1587 { 1588 return to_m64(_mm_cvttpd_epi32(v)); 1589 } 1590 unittest 1591 { 1592 int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f)); 1593 int[2] correct = [-4, 45641]; 1594 assert(R.array == correct); 1595 } 1596 1597 __m128i _mm_cvttps_epi32 (__m128 a) pure @trusted 1598 { 1599 // x86: Generates cvttps2dq since LDC 1.3 -O2 1600 // ARM64: generates fcvtze since LDC 1.8 -O2 1601 __m128i r; 1602 r.ptr[0] = cast(int)a.array[0]; 1603 r.ptr[1] = cast(int)a.array[1]; 1604 r.ptr[2] = cast(int)a.array[2]; 1605 r.ptr[3] = cast(int)a.array[3]; 1606 return r; 1607 } 1608 unittest 1609 { 1610 __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f)); 1611 assert(R.array == [-4, 45641, 0, 1]); 1612 } 1613 1614 int _mm_cvttsd_si32 (__m128d a) 1615 { 1616 // Generates cvttsd2si since LDC 1.3 -O0 1617 return cast(int)a.array[0]; 1618 } 1619 1620 long _mm_cvttsd_si64 (__m128d a) 1621 { 1622 // Generates cvttsd2si since LDC 1.3 -O0 1623 // but in 32-bit instead, it's a long sequence that resort to FPU 1624 return cast(long)a.array[0]; 1625 } 1626 1627 alias _mm_cvttsd_si64x = _mm_cvttsd_si64; 1628 1629 __m128d _mm_div_pd(__m128d a, __m128d b) pure @safe 1630 { 1631 return a / b; 1632 } 1633 1634 static if (GDC_with_SSE2) 1635 { 1636 __m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted 1637 { 1638 return __builtin_ia32_divsd(a, b); 1639 } 1640 } 1641 else version(DigitalMars) 1642 { 1643 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 1644 __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe 1645 { 1646 asm pure nothrow @nogc @trusted { nop;} 1647 a.array[0] = a.array[0] / b.array[0]; 1648 return a; 1649 } 1650 } 1651 else 1652 { 1653 __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe 1654 { 1655 a.array[0] /= b.array[0]; 1656 return a; 1657 } 1658 } 1659 unittest 1660 { 1661 __m128d a = [2.0, 4.5]; 1662 a = _mm_div_sd(a, a); 1663 assert(a.array == [1.0, 4.5]); 1664 } 1665 1666 /// Extract a 16-bit integer from `v`, selected with `index` 1667 // PERF: ARM version has array bound check 1668 int _mm_extract_epi16(__m128i v, int index) pure @safe 1669 { 1670 short8 r = cast(short8)v; 1671 return cast(ushort)(r.array[index]); 1672 } 1673 unittest 1674 { 1675 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1); 1676 assert(_mm_extract_epi16(A, 6) == 6); 1677 assert(_mm_extract_epi16(A, 0) == 65535); 1678 } 1679 1680 /// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`. 1681 __m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted 1682 { 1683 short8 r = cast(short8)v; 1684 r.ptr[index & 7] = cast(short)i; 1685 return cast(__m128i)r; 1686 } 1687 unittest 1688 { 1689 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 1690 short8 R = cast(short8) _mm_insert_epi16(A, 42, 6); 1691 short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7]; 1692 assert(R.array == correct); 1693 } 1694 1695 version(GNU) 1696 { 1697 void _mm_lfence() pure @trusted 1698 { 1699 static if (GDC_with_SSE2) 1700 { 1701 __builtin_ia32_lfence(); 1702 } 1703 else version(X86) 1704 { 1705 asm pure nothrow @nogc @trusted 1706 { 1707 "lfence;\n" : : : ; 1708 } 1709 } 1710 else 1711 static assert(false); 1712 } 1713 } 1714 else static if (LDC_with_SSE2) 1715 { 1716 alias _mm_lfence = __builtin_ia32_lfence; 1717 } 1718 else static if (DMD_with_asm) 1719 { 1720 void _mm_lfence() pure @safe 1721 { 1722 asm nothrow @nogc pure @safe 1723 { 1724 lfence; 1725 } 1726 } 1727 } 1728 else version(LDC) 1729 { 1730 void _mm_lfence() pure @safe 1731 { 1732 llvm_memory_fence(); // Note: actually generates mfence 1733 } 1734 } 1735 else 1736 static assert(false); 1737 unittest 1738 { 1739 _mm_lfence(); 1740 } 1741 1742 1743 __m128d _mm_load_pd (const(double) * mem_addr) pure 1744 { 1745 __m128d* aligned = cast(__m128d*)mem_addr; 1746 return *aligned; 1747 } 1748 1749 __m128d _mm_load_pd1 (const(double)* mem_addr) pure 1750 { 1751 double[2] arr = [*mem_addr, *mem_addr]; 1752 return loadUnaligned!(double2)(&arr[0]); 1753 } 1754 1755 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted 1756 { 1757 double2 r = [0, 0]; 1758 r.ptr[0] = *mem_addr; 1759 return r; 1760 } 1761 unittest 1762 { 1763 double x = -42; 1764 __m128d a = _mm_load_sd(&x); 1765 assert(a.array == [-42.0, 0.0]); 1766 } 1767 1768 __m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted 1769 { 1770 return *mem_addr; 1771 } 1772 1773 alias _mm_load1_pd = _mm_load_pd1; 1774 1775 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted 1776 { 1777 a.ptr[1] = *mem_addr; 1778 return a; 1779 } 1780 1781 // Note: strange signature since the memory doesn't have to aligned 1782 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted 1783 { 1784 auto pLong = cast(const(long)*)mem_addr; 1785 long2 r = [0, 0]; 1786 r.ptr[0] = *pLong; 1787 return cast(__m128i)(r); 1788 } 1789 1790 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted 1791 { 1792 a.ptr[0] = *mem_addr; 1793 return a; 1794 } 1795 1796 __m128d _mm_loadr_pd2 (const(double)* mem_addr) pure @trusted 1797 { 1798 __m128d a = *cast(__m128d*)(mem_addr); 1799 __m128d r; 1800 r.ptr[0] = a.array[1]; 1801 r.ptr[1] = a.array[0]; 1802 return r; 1803 } 1804 1805 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe 1806 { 1807 static if (GDC_with_SSE2) 1808 { 1809 return __builtin_ia32_loadupd(mem_addr); 1810 } 1811 else 1812 { 1813 return loadUnaligned!(double2)(mem_addr); 1814 } 1815 } 1816 1817 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted 1818 { 1819 static if (GDC_with_SSE2) 1820 { 1821 return __builtin_ia32_loaddqu(cast(const(char*))mem_addr); 1822 } 1823 else 1824 { 1825 return loadUnaligned!(__m128i)(cast(int*)mem_addr); 1826 } 1827 } 1828 1829 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted 1830 { 1831 int r = *cast(int*)(mem_addr); 1832 int4 result = [0, 0, 0, 0]; 1833 result.ptr[0] = r; 1834 return result; 1835 } 1836 unittest 1837 { 1838 int r = 42; 1839 __m128i A = _mm_loadu_si32(&r); 1840 int[4] correct = [42, 0, 0, 0]; 1841 assert(A.array == correct); 1842 } 1843 1844 static if (GDC_with_SSE2) 1845 { 1846 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 1847 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 1848 /// and pack the results in destination. 1849 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe 1850 { 1851 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 1852 } 1853 } 1854 else static if (LDC_with_SSE2) 1855 { 1856 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 1857 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 1858 /// and pack the results in destination. 1859 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe 1860 { 1861 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 1862 } 1863 } 1864 else static if (LDC_with_ARM64) 1865 { 1866 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 1867 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 1868 /// and pack the results in destination. 1869 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe 1870 { 1871 int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b)); 1872 int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b)); 1873 int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); 1874 int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); 1875 return vcombine_s32(rl, rh); 1876 } 1877 } 1878 else 1879 { 1880 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 1881 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 1882 /// and pack the results in destination. 1883 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe 1884 { 1885 short8 sa = cast(short8)a; 1886 short8 sb = cast(short8)b; 1887 1888 int4 r; 1889 foreach(i; 0..4) 1890 { 1891 r.array[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1]; 1892 } 1893 return r; 1894 } 1895 } 1896 unittest 1897 { 1898 short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 1899 short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 1900 int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B); 1901 int[4] correct = [1, 13, -2147483648, 2*32767*32767]; 1902 assert(R.array == correct); 1903 } 1904 1905 1906 static if (GDC_with_SSE2) 1907 { 1908 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 1909 /// (elements are not stored when the highest bit is not set in the corresponding element) 1910 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 1911 /// boundary. 1912 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted 1913 { 1914 return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr); 1915 } 1916 } 1917 else static if (LDC_with_SSE2) 1918 { 1919 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 1920 /// (elements are not stored when the highest bit is not set in the corresponding element) 1921 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 1922 /// boundary. 1923 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted 1924 { 1925 return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr); 1926 } 1927 } 1928 else 1929 { 1930 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 1931 /// (elements are not stored when the highest bit is not set in the corresponding element) 1932 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 1933 /// boundary. 1934 // PERF: catastrophic on ARM 1935 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted 1936 { 1937 byte16 b = cast(byte16)a; 1938 byte16 m = cast(byte16)mask; 1939 byte* dest = cast(byte*)(mem_addr); 1940 foreach(j; 0..16) 1941 { 1942 if (m.array[j] & 128) 1943 { 1944 dest[j] = b.array[j]; 1945 } 1946 } 1947 } 1948 } 1949 unittest 1950 { 1951 ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]; 1952 __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0); 1953 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15); 1954 _mm_maskmoveu_si128(A, mask, dest.ptr); 1955 ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42]; 1956 assert(dest == correct); 1957 } 1958 1959 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe 1960 { 1961 // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have 1962 __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else 1963 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1964 __m128i mask = _mm_and_si128(aTob, lowerShorts); 1965 return _mm_xor_si128(b, mask); 1966 } 1967 unittest 1968 { 1969 short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-57), 1970 _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0)); 1971 short[8] correct = [45, 1, 9, 7, 9, 7, 0, 0]; 1972 assert(R.array == correct); 1973 } 1974 1975 1976 // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have 1977 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe 1978 { 1979 // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have 1980 __m128i value128 = _mm_set1_epi8(-128); 1981 __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 1982 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 1983 __m128i mask = _mm_and_si128(aTob, higher); 1984 return _mm_xor_si128(b, mask); 1985 } 1986 unittest 1987 { 1988 byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 1989 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 1990 byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 1991 assert(R.array == correct); 1992 } 1993 1994 __m128d _mm_max_pd (__m128d a, __m128d b) pure @safe 1995 { 1996 static if (GDC_with_SSE2) 1997 { 1998 return __builtin_ia32_maxpd(a, b); 1999 } 2000 else 2001 { 2002 // Generates maxpd starting with LDC 1.9 2003 a[0] = (a[0] > b[0]) ? a[0] : b[0]; 2004 a[1] = (a[1] > b[1]) ? a[1] : b[1]; 2005 return a; 2006 } 2007 } 2008 unittest 2009 { 2010 __m128d A = _mm_setr_pd(4.0, 1.0); 2011 __m128d B = _mm_setr_pd(1.0, 8.0); 2012 __m128d M = _mm_max_pd(A, B); 2013 assert(M.array[0] == 4.0); 2014 assert(M.array[1] == 8.0); 2015 } 2016 2017 __m128d _mm_max_sd (__m128d a, __m128d b) pure @safe 2018 { 2019 static if (GDC_with_SSE2) 2020 { 2021 return __builtin_ia32_maxsd(a, b); 2022 } 2023 else 2024 { 2025 __m128d r = a; 2026 // Generates maxsd starting with LDC 1.3 2027 r.array[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; 2028 return r; 2029 } 2030 } 2031 unittest 2032 { 2033 __m128d A = _mm_setr_pd(1.0, 1.0); 2034 __m128d B = _mm_setr_pd(4.0, 2.0); 2035 __m128d M = _mm_max_sd(A, B); 2036 assert(M.array[0] == 4.0); 2037 assert(M.array[1] == 1.0); 2038 } 2039 2040 version(GNU) 2041 { 2042 void _mm_mfence() pure @trusted 2043 { 2044 static if (GDC_with_SSE2) 2045 { 2046 __builtin_ia32_mfence(); 2047 } 2048 else version(X86) 2049 { 2050 asm pure nothrow @nogc @trusted 2051 { 2052 "mfence;\n" : : : ; 2053 } 2054 } 2055 else 2056 static assert(false); 2057 } 2058 } 2059 else static if (LDC_with_SSE2) 2060 { 2061 alias _mm_mfence = __builtin_ia32_mfence; 2062 } 2063 else static if (DMD_with_asm) 2064 { 2065 void _mm_mfence() pure @safe 2066 { 2067 asm nothrow @nogc pure @safe 2068 { 2069 mfence; 2070 } 2071 } 2072 } 2073 else version(LDC) 2074 { 2075 void _mm_mfence() pure @safe 2076 { 2077 // Note: will generate the DMB instruction on ARM 2078 llvm_memory_fence(); 2079 } 2080 } 2081 else 2082 static assert(false); 2083 unittest 2084 { 2085 _mm_mfence(); 2086 } 2087 2088 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe 2089 { 2090 // Note: clang uses a __builtin_ia32_pminsw128 which has disappeared from LDC LLVM (?) 2091 // Implemented using masks and XOR 2092 __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else 2093 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2094 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2095 return _mm_xor_si128(b, mask); 2096 } 2097 unittest 2098 { 2099 short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-57), 2100 _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0)); 2101 short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -57]; 2102 assert(R.array == correct); 2103 } 2104 2105 2106 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe 2107 { 2108 // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have 2109 __m128i value128 = _mm_set1_epi8(-128); 2110 __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2111 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2112 __m128i mask = _mm_and_si128(aTob, lower); 2113 return _mm_xor_si128(b, mask); 2114 } 2115 unittest 2116 { 2117 byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2118 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2119 byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; 2120 assert(R.array == correct); 2121 } 2122 2123 __m128d _mm_min_pd (__m128d a, __m128d b) pure @safe 2124 { 2125 static if (GDC_with_SSE2) 2126 { 2127 return __builtin_ia32_minpd(a, b); 2128 } 2129 else 2130 { 2131 // Generates minpd starting with LDC 1.9 2132 a.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2133 a.array[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1]; 2134 return a; 2135 } 2136 } 2137 unittest 2138 { 2139 __m128d A = _mm_setr_pd(1.0, 2.0); 2140 __m128d B = _mm_setr_pd(4.0, 1.0); 2141 __m128d M = _mm_min_pd(A, B); 2142 assert(M.array[0] == 1.0); 2143 assert(M.array[1] == 1.0); 2144 } 2145 2146 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe 2147 { 2148 static if (GDC_with_SSE2) 2149 { 2150 return __builtin_ia32_minsd(a, b); 2151 } 2152 else 2153 { 2154 // Generates minsd starting with LDC 1.3 2155 __m128d r = a; 2156 r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2157 return r; 2158 } 2159 } 2160 unittest 2161 { 2162 __m128d A = _mm_setr_pd(1.0, 3.0); 2163 __m128d B = _mm_setr_pd(4.0, 2.0); 2164 __m128d M = _mm_min_sd(A, B); 2165 assert(M.array[0] == 1.0); 2166 assert(M.array[1] == 3.0); 2167 } 2168 2169 __m128i _mm_move_epi64 (__m128i a) pure @safe 2170 { 2171 static if (GDC_with_SSE2) 2172 { 2173 return __builtin_ia32_movq128(a); 2174 } 2175 else 2176 { 2177 long2 result = [ 0, 0 ]; 2178 long2 la = cast(long2) a; 2179 result.array[0] = la.array[0]; 2180 return cast(__m128i)(result); 2181 } 2182 } 2183 unittest 2184 { 2185 long2 A = [13, 47]; 2186 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); 2187 long[2] correct = [13, 0]; 2188 assert(B.array == correct); 2189 } 2190 2191 __m128d _mm_move_sd (__m128d a, __m128d b) pure @safe 2192 { 2193 static if (GDC_with_SSE2) 2194 { 2195 return __builtin_ia32_movsd(a, b); 2196 } 2197 else 2198 { 2199 b.array[1] = a.array[1]; 2200 return b; 2201 } 2202 } 2203 unittest 2204 { 2205 double2 A = [13.0, 47.0]; 2206 double2 B = [34.0, 58.0]; 2207 double2 C = _mm_move_sd(A, B); 2208 double[2] correct = [34.0, 47.0]; 2209 assert(C.array == correct); 2210 } 2211 2212 static if (GDC_with_SSE2) 2213 { 2214 /// Create mask from the most significant bit of each 8-bit element in `v`. 2215 alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128; 2216 } 2217 else static if (LDC_with_SSE2) 2218 { 2219 /// Create mask from the most significant bit of each 8-bit element in `v`. 2220 int _mm_movemask_epi8(__m128i v) pure @safe 2221 { 2222 return __builtin_ia32_pmovmskb128(cast(byte16)v); 2223 } 2224 } 2225 else static if (LDC_with_ARM64) 2226 { 2227 // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon 2228 // The other two solutions lead to unfound intrinsics in LLVM and that took a long time. 2229 // SO there might be something a bit faster, but this one is reasonable and branchless. 2230 2231 /// Create mask from the most significant bit of each 8-bit element in `v`. 2232 int _mm_movemask_epi8 (__m128i a) pure @trusted 2233 { 2234 byte8 mask_shift; 2235 mask_shift.ptr[0] = 7; 2236 mask_shift.ptr[1] = 6; 2237 mask_shift.ptr[2] = 5; 2238 mask_shift.ptr[3] = 4; 2239 mask_shift.ptr[4] = 3; 2240 mask_shift.ptr[5] = 2; 2241 mask_shift.ptr[6] = 1; 2242 mask_shift.ptr[7] = 0; 2243 byte8 mask_and = byte8(-128); 2244 byte8 lo = vget_low_u8(cast(byte16)a); 2245 byte8 hi = vget_high_u8(cast(byte16)a); 2246 lo = vand_u8(lo, mask_and); 2247 lo = vshr_u8(lo, mask_shift); 2248 hi = vand_u8(hi, mask_and); 2249 hi = vshr_u8(hi, mask_shift); 2250 lo = vpadd_u8(lo,lo); 2251 lo = vpadd_u8(lo,lo); 2252 lo = vpadd_u8(lo,lo); 2253 hi = vpadd_u8(hi,hi); 2254 hi = vpadd_u8(hi,hi); 2255 hi = vpadd_u8(hi,hi); 2256 return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]); 2257 } 2258 } 2259 else 2260 { 2261 /// Create mask from the most significant bit of each 8-bit element in `v`. 2262 int _mm_movemask_epi8(__m128i v) pure @safe 2263 { 2264 byte16 ai = cast(byte16)v; 2265 int r = 0; 2266 foreach(bit; 0..16) 2267 { 2268 if (ai.array[bit] < 0) r += (1 << bit); 2269 } 2270 return r; 2271 } 2272 } 2273 unittest 2274 { 2275 assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0))); 2276 } 2277 2278 static if (GDC_with_SSE2) 2279 { 2280 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2281 /// packed double-precision (64-bit) floating-point element in `v`. 2282 alias _mm_movemask_pd = __builtin_ia32_movmskpd; 2283 } 2284 else static if (LDC_with_SSE2) 2285 { 2286 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2287 /// packed double-precision (64-bit) floating-point element in `v`. 2288 alias _mm_movemask_pd = __builtin_ia32_movmskpd; 2289 } 2290 else 2291 { 2292 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2293 /// packed double-precision (64-bit) floating-point element in `v`. 2294 int _mm_movemask_pd(__m128d v) pure @safe 2295 { 2296 long2 lv = cast(long2)v; 2297 int r = 0; 2298 if (lv.array[0] < 0) r += 1; 2299 if (lv.array[1] < 0) r += 2; 2300 return r; 2301 } 2302 } 2303 unittest 2304 { 2305 __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0); 2306 assert(_mm_movemask_pd(A) == 2); 2307 } 2308 2309 /// Copy the lower 64-bit integer in `v`. 2310 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe 2311 { 2312 long2 lv = cast(long2)v; 2313 return long1(lv.array[0]); 2314 } 2315 unittest 2316 { 2317 __m128i A = _mm_set_epi64x(-1, -2); 2318 __m64 R = _mm_movepi64_pi64(A); 2319 assert(R.array[0] == -2); 2320 } 2321 2322 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element. 2323 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted 2324 { 2325 long2 r; 2326 r.ptr[0] = a.array[0]; 2327 r.ptr[1] = 0; 2328 return cast(__m128i)r; 2329 } 2330 2331 // Note: generates pmuludq in LDC with -O1 2332 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted 2333 { 2334 __m128i zero = _mm_setzero_si128(); 2335 2336 static if (__VERSION__ >= 2088) 2337 { 2338 // Need LLVM9 to avoid this shufflevector 2339 long2 la, lb; 2340 la.ptr[0] = cast(uint)a.array[0]; 2341 la.ptr[1] = cast(uint)a.array[2]; 2342 lb.ptr[0] = cast(uint)b.array[0]; 2343 lb.ptr[1] = cast(uint)b.array[2]; 2344 } 2345 else 2346 { 2347 long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero); 2348 long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero); 2349 } 2350 2351 static if (__VERSION__ >= 2076) 2352 { 2353 return cast(__m128i)(la * lb); 2354 } 2355 else 2356 { 2357 // long2 mul not supported before LDC 1.5 2358 la.ptr[0] *= lb.array[0]; 2359 la.ptr[1] *= lb.array[1]; 2360 return cast(__m128i)(la); 2361 } 2362 } 2363 unittest 2364 { 2365 __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff); 2366 __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff); 2367 __m128i C = _mm_mul_epu32(A, B); 2368 long2 LC = cast(long2)C; 2369 assert(LC.array[0] == 18446744065119617025uL); 2370 assert(LC.array[1] == 12723420444339690338uL); 2371 } 2372 2373 2374 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe 2375 { 2376 return a * b; 2377 } 2378 unittest 2379 { 2380 __m128d a = [-2.0, 1.5]; 2381 a = _mm_mul_pd(a, a); 2382 assert(a.array == [4.0, 2.25]); 2383 } 2384 2385 version(DigitalMars) 2386 { 2387 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 2388 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe 2389 { 2390 asm pure nothrow @nogc @trusted { nop;} 2391 a.array[0] = a.array[0] * b.array[0]; 2392 return a; 2393 } 2394 } 2395 else 2396 { 2397 static if (GDC_with_SSE2) 2398 { 2399 alias _mm_mul_sd = __builtin_ia32_mulsd; 2400 } 2401 else 2402 { 2403 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe 2404 { 2405 a.array[0] *= b.array[0]; 2406 return a; 2407 } 2408 } 2409 } 2410 unittest 2411 { 2412 __m128d a = [-2.0, 1.5]; 2413 a = _mm_mul_sd(a, a); 2414 assert(a.array == [4.0, 1.5]); 2415 } 2416 2417 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 2418 /// and get an unsigned 64-bit result. 2419 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe 2420 { 2421 return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b))); 2422 } 2423 unittest 2424 { 2425 __m64 A = _mm_set_pi32(42, 0xDEADBEEF); 2426 __m64 B = _mm_set_pi32(42, 0xCAFEBABE); 2427 __m64 C = _mm_mul_su32(A, B); 2428 assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL); 2429 } 2430 2431 static if (GDC_with_SSE2) 2432 { 2433 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted 2434 { 2435 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2436 } 2437 } 2438 else static if (LDC_with_SSE2) 2439 { 2440 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted 2441 { 2442 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2443 } 2444 } 2445 else 2446 { 2447 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted 2448 { 2449 short8 sa = cast(short8)a; 2450 short8 sb = cast(short8)b; 2451 short8 r = void; 2452 r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16; 2453 r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16; 2454 r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16; 2455 r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16; 2456 r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16; 2457 r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16; 2458 r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16; 2459 r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16; 2460 return cast(__m128i)r; 2461 } 2462 } 2463 unittest 2464 { 2465 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2466 __m128i B = _mm_set1_epi16(16384); 2467 short8 R = cast(short8)_mm_mulhi_epi16(A, B); 2468 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; 2469 assert(R.array == correct); 2470 } 2471 2472 static if (GDC_with_SSE2) 2473 { 2474 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted 2475 { 2476 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2477 } 2478 } 2479 else static if (LDC_with_SSE2) 2480 { 2481 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted 2482 { 2483 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2484 } 2485 } 2486 else 2487 { 2488 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted 2489 { 2490 short8 sa = cast(short8)a; 2491 short8 sb = cast(short8)b; 2492 short8 r = void; 2493 r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 ); 2494 r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 ); 2495 r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 ); 2496 r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 ); 2497 r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 ); 2498 r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 ); 2499 r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 ); 2500 r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 ); 2501 return cast(__m128i)r; 2502 } 2503 } 2504 unittest 2505 { 2506 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2507 __m128i B = _mm_set1_epi16(16384); 2508 short8 R = cast(short8)_mm_mulhi_epu16(A, B); 2509 short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; 2510 assert(R.array == correct); 2511 } 2512 2513 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe 2514 { 2515 return cast(__m128i)(cast(short8)a * cast(short8)b); 2516 } 2517 unittest 2518 { 2519 __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7); 2520 __m128i B = _mm_set1_epi16(16384); 2521 short8 R = cast(short8)_mm_mullo_epi16(A, B); 2522 short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384]; 2523 assert(R.array == correct); 2524 } 2525 2526 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe 2527 { 2528 return cast(__m128d)( cast(__m128i)a | cast(__m128i)b ); 2529 } 2530 2531 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe 2532 { 2533 return a | b; 2534 } 2535 2536 static if (GDC_with_SSE2) 2537 { 2538 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted 2539 { 2540 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2541 } 2542 } 2543 else static if (LDC_with_SSE2) 2544 { 2545 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted 2546 { 2547 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2548 } 2549 } 2550 else 2551 { 2552 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted 2553 { 2554 short8 r; 2555 r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]); 2556 r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]); 2557 r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]); 2558 r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]); 2559 r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]); 2560 r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]); 2561 r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]); 2562 r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]); 2563 return cast(__m128i)r; 2564 } 2565 } 2566 unittest 2567 { 2568 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 2569 short8 R = cast(short8) _mm_packs_epi32(A, A); 2570 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; 2571 assert(R.array == correct); 2572 } 2573 2574 static if (GDC_with_SSE2) 2575 { 2576 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted 2577 { 2578 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2579 } 2580 } 2581 else static if (LDC_with_SSE2) 2582 { 2583 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted 2584 { 2585 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2586 } 2587 } 2588 else static if (LDC_with_ARM64) 2589 { 2590 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted 2591 { 2592 // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02 2593 byte8 ra = vqmovn_s16(cast(short8)a); 2594 byte8 rb = vqmovn_s16(cast(short8)b); 2595 return cast(__m128i)vcombine_s8(ra, rb); 2596 } 2597 } 2598 else 2599 { 2600 // PERF: ARM 2601 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted 2602 { 2603 byte16 r; 2604 short8 sa = cast(short8)a; 2605 short8 sb = cast(short8)b; 2606 foreach(i; 0..8) 2607 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]); 2608 foreach(i; 0..8) 2609 r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]); 2610 return cast(__m128i)r; 2611 } 2612 } 2613 unittest 2614 { 2615 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); 2616 byte16 R = cast(byte16) _mm_packs_epi16(A, A); 2617 byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0, 2618 127, -128, 127, 0, 127, -128, 127, 0]; 2619 assert(R.array == correct); 2620 } 2621 2622 static if (GDC_with_SSE2) 2623 { 2624 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2625 { 2626 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2627 } 2628 } 2629 else static if (LDC_with_SSE2) 2630 { 2631 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2632 { 2633 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2634 } 2635 } 2636 else static if (LDC_with_ARM64) 2637 { 2638 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2639 { 2640 // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02 2641 byte8 ra = vqmovun_s16(cast(short8)a); 2642 byte8 rb = vqmovun_s16(cast(short8)b); 2643 return cast(__m128i)vcombine_s8(ra, rb); 2644 } 2645 } 2646 else 2647 { 2648 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2649 { 2650 short8 sa = cast(short8)a; 2651 short8 sb = cast(short8)b; 2652 ubyte[16] result = void; 2653 for (int i = 0; i < 8; ++i) 2654 { 2655 short s = sa[i]; 2656 if (s < 0) s = 0; 2657 if (s > 255) s = 255; 2658 result[i] = cast(ubyte)s; 2659 2660 s = sb[i]; 2661 if (s < 0) s = 0; 2662 if (s > 255) s = 255; 2663 result[i+8] = cast(ubyte)s; 2664 } 2665 return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr); 2666 } 2667 } 2668 unittest 2669 { 2670 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 2671 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 2672 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 2673 0, 255, 0, 255, 255, 2, 1, 0]; 2674 foreach(i; 0..16) 2675 assert(AA.array[i] == cast(byte)(correctResult[i])); 2676 } 2677 2678 2679 version(GNU) 2680 { 2681 void _mm_pause() pure @trusted 2682 { 2683 static if (GDC_with_SSE2) 2684 { 2685 __builtin_ia32_pause(); 2686 } 2687 else version(X86) 2688 { 2689 asm pure nothrow @nogc @trusted 2690 { 2691 "pause;\n" : : : ; 2692 } 2693 } 2694 else 2695 static assert(false); 2696 } 2697 } 2698 else static if (LDC_with_SSE2) 2699 { 2700 alias _mm_pause = __builtin_ia32_pause; 2701 } 2702 else static if (DMD_with_asm) 2703 { 2704 void _mm_pause() pure @safe 2705 { 2706 asm nothrow @nogc pure @safe 2707 { 2708 rep; nop; // F3 90 = pause 2709 } 2710 } 2711 } 2712 else version (LDC) 2713 { 2714 void _mm_pause() pure @safe 2715 { 2716 // PERF: Do nothing currently , could be the "yield" intruction on ARM. 2717 } 2718 } 2719 else 2720 static assert(false); 2721 unittest 2722 { 2723 _mm_pause(); 2724 } 2725 2726 static if (GDC_with_SSE2) 2727 { 2728 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted 2729 { 2730 return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b); 2731 } 2732 } 2733 else static if (LDC_with_SSE2) 2734 { 2735 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted 2736 { 2737 return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b); 2738 } 2739 } 2740 else 2741 { 2742 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted 2743 { 2744 byte16 ab = cast(byte16)a; 2745 byte16 bb = cast(byte16)b; 2746 ubyte[16] t; 2747 foreach(i; 0..16) 2748 { 2749 int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]); 2750 if (diff < 0) diff = -diff; 2751 t[i] = cast(ubyte)(diff); 2752 } 2753 int4 r = _mm_setzero_si128(); 2754 r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 2755 r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; 2756 return r; 2757 } 2758 } 2759 unittest 2760 { 2761 __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 2762 __m128i B = _mm_set1_epi8(1); 2763 __m128i R = _mm_sad_epu8(A, B); 2764 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 2765 0, 2766 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 2767 0]; 2768 assert(R.array == correct); 2769 } 2770 2771 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 2772 { 2773 short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7]; 2774 return cast(__m128i) loadUnaligned!(short8)(result.ptr); 2775 } 2776 unittest 2777 { 2778 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 2779 short8 B = cast(short8) A; 2780 foreach(i; 0..8) 2781 assert(B.array[i] == i); 2782 } 2783 2784 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 2785 { 2786 int[4] result = [e0, e1, e2, e3]; 2787 return loadUnaligned!(int4)(result.ptr); 2788 } 2789 unittest 2790 { 2791 __m128i A = _mm_set_epi32(3, 2, 1, 0); 2792 foreach(i; 0..4) 2793 assert(A.array[i] == i); 2794 } 2795 2796 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted 2797 { 2798 long[2] result = [e0.array[0], e1.array[0]]; 2799 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 2800 } 2801 unittest 2802 { 2803 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); 2804 long2 B = cast(long2) A; 2805 assert(B.array[0] == 5678); 2806 assert(B.array[1] == 1234); 2807 } 2808 2809 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted 2810 { 2811 long[2] result = [e0, e1]; 2812 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 2813 } 2814 unittest 2815 { 2816 __m128i A = _mm_set_epi64x(1234, 5678); 2817 long2 B = cast(long2) A; 2818 assert(B.array[0] == 5678); 2819 assert(B.array[1] == 1234); 2820 } 2821 2822 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12, 2823 byte e11, byte e10, byte e9, byte e8, 2824 byte e7, byte e6, byte e5, byte e4, 2825 byte e3, byte e2, byte e1, byte e0) pure @trusted 2826 { 2827 byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7, 2828 e8, e9, e10, e11, e12, e13, e14, e15]; 2829 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 2830 } 2831 2832 __m128d _mm_set_pd (double e1, double e0) pure @trusted 2833 { 2834 double[2] result = [e0, e1]; 2835 return loadUnaligned!(double2)(result.ptr); 2836 } 2837 unittest 2838 { 2839 __m128d A = _mm_set_pd(61.0, 55.0); 2840 double[2] correct = [55.0, 61.0]; 2841 assert(A.array == correct); 2842 } 2843 2844 __m128d _mm_set_pd1 (double a) pure @trusted 2845 { 2846 double[2] result = [a, a]; 2847 return loadUnaligned!(double2)(result.ptr); 2848 } 2849 unittest 2850 { 2851 __m128d A = _mm_set_pd1(61.0); 2852 double[2] correct = [61.0, 61.0]; 2853 assert(A.array == correct); 2854 } 2855 2856 __m128d _mm_set_sd (double a) pure @trusted 2857 { 2858 double[2] result = [a, 0]; 2859 return loadUnaligned!(double2)(result.ptr); 2860 } 2861 2862 __m128i _mm_set1_epi16 (short a) pure @trusted 2863 { 2864 return cast(__m128i)(short8(a)); 2865 } 2866 2867 __m128i _mm_set1_epi32 (int a) pure @trusted 2868 { 2869 return cast(__m128i)(int4(a)); 2870 } 2871 unittest 2872 { 2873 __m128 a = _mm_set1_ps(-1.0f); 2874 __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff); 2875 assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]); 2876 } 2877 2878 /// Broadcast 64-bit integer `a` to all elements of `dst`. 2879 __m128i _mm_set1_epi64 (__m64 a) pure @safe 2880 { 2881 return _mm_set_epi64(a, a); 2882 } 2883 2884 __m128i _mm_set1_epi64x (long a) pure @trusted 2885 { 2886 return cast(__m128i)(long2(a)); 2887 } 2888 2889 __m128i _mm_set1_epi8 (byte a) pure @trusted 2890 { 2891 return cast(__m128i)(byte16(a)); 2892 } 2893 2894 alias _mm_set1_pd = _mm_set_pd1; 2895 2896 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 2897 short e3, short e2, short e1, short e0) pure @trusted 2898 { 2899 short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0]; 2900 return cast(__m128i)( loadUnaligned!(short8)(result.ptr) ); 2901 } 2902 2903 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted 2904 { 2905 int[4] result = [e3, e2, e1, e0]; 2906 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 2907 } 2908 2909 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted 2910 { 2911 long[2] result = [e1, e0]; 2912 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 2913 } 2914 2915 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12, 2916 byte e11, byte e10, byte e9, byte e8, 2917 byte e7, byte e6, byte e5, byte e4, 2918 byte e3, byte e2, byte e1, byte e0) pure @trusted 2919 { 2920 byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, 2921 e7, e6, e5, e4, e3, e2, e1, e0]; 2922 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 2923 } 2924 2925 __m128d _mm_setr_pd (double e1, double e0) pure @trusted 2926 { 2927 double[2] result = [e1, e0]; 2928 return loadUnaligned!(double2)(result.ptr); 2929 } 2930 unittest 2931 { 2932 __m128d A = _mm_setr_pd(61.0, 55.0); 2933 double[2] correct = [61.0, 55.0]; 2934 assert(A.array == correct); 2935 } 2936 2937 __m128d _mm_setzero_pd () pure @trusted 2938 { 2939 // Note: using loadUnaligned has better -O0 codegen compared to .ptr 2940 double[2] result = [0.0, 0.0]; 2941 return loadUnaligned!(double2)(result.ptr); 2942 } 2943 2944 __m128i _mm_setzero_si128() pure @trusted 2945 { 2946 // Note: using loadUnaligned has better -O0 codegen compared to .ptr 2947 int[4] result = [0, 0, 0, 0]; 2948 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 2949 } 2950 2951 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe 2952 { 2953 static if (GDC_with_SSE2) 2954 { 2955 return __builtin_ia32_pshufd(a, imm8); 2956 } 2957 else 2958 { 2959 return shufflevector!(int4, (imm8 >> 0) & 3, 2960 (imm8 >> 2) & 3, 2961 (imm8 >> 4) & 3, 2962 (imm8 >> 6) & 3)(a, a); 2963 } 2964 } 2965 unittest 2966 { 2967 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 2968 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 2969 int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A); 2970 int[4] expectedB = [ 3, 2, 1, 0 ]; 2971 assert(B.array == expectedB); 2972 } 2973 2974 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe 2975 { 2976 static if (GDC_with_SSE2) 2977 { 2978 return __builtin_ia32_shufpd(a, b, imm8); 2979 } 2980 else 2981 { 2982 return shufflevector!(double2, 0 + ( imm8 & 1 ), 2983 2 + ( (imm8 >> 1) & 1 ))(a, b); 2984 } 2985 } 2986 unittest 2987 { 2988 __m128d A = _mm_setr_pd(0.5, 2.0); 2989 __m128d B = _mm_setr_pd(4.0, 5.0); 2990 enum int SHUFFLE = _MM_SHUFFLE2(1, 1); 2991 __m128d R = _mm_shuffle_pd!SHUFFLE(A, B); 2992 double[2] correct = [ 2.0, 5.0 ]; 2993 assert(R.array == correct); 2994 } 2995 2996 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe 2997 { 2998 static if (GDC_with_SSE2) 2999 { 3000 return __builtin_ia32_pshufhw(a, imm8); 3001 } 3002 else 3003 { 3004 return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3, 3005 4 + ( (imm8 >> 0) & 3 ), 3006 4 + ( (imm8 >> 2) & 3 ), 3007 4 + ( (imm8 >> 4) & 3 ), 3008 4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a); 3009 } 3010 } 3011 unittest 3012 { 3013 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3014 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3015 short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A); 3016 short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ]; 3017 assert(C.array == expectedC); 3018 } 3019 3020 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe 3021 { 3022 static if (GDC_with_SSE2) 3023 { 3024 return __builtin_ia32_pshuflw(a, imm8); 3025 } 3026 else 3027 { 3028 return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ), 3029 ( (imm8 >> 2) & 3 ), 3030 ( (imm8 >> 4) & 3 ), 3031 ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a); 3032 } 3033 } 3034 unittest 3035 { 3036 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3037 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3038 short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A); 3039 short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ]; 3040 assert(B.array == expectedB); 3041 } 3042 3043 static if (LDC_with_SSE2) 3044 { 3045 deprecated("Use _mm_slli_epi32 instead.") alias _mm_sll_epi32 = __builtin_ia32_pslld128; 3046 } 3047 else static if (GDC_with_SSE2) 3048 { 3049 deprecated("Use _mm_slli_epi32 instead.") alias _mm_sll_epi32 = __builtin_ia32_pslld128; 3050 } 3051 else static if (DMD_with_32bit_asm) 3052 { 3053 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe 3054 { 3055 asm pure nothrow @nogc @trusted 3056 { 3057 movdqu XMM0, a; 3058 movdqu XMM1, count; 3059 pslld XMM0, XMM1; 3060 movdqu a, XMM0; 3061 } 3062 return a; 3063 } 3064 } 3065 else 3066 { 3067 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe 3068 { 3069 int4 r = void; 3070 long2 lc = cast(long2)count; 3071 int bits = cast(int)(lc.array[0]); 3072 foreach(i; 0..4) 3073 r[i] = cast(uint)(a[i]) << bits; 3074 return r; 3075 } 3076 } 3077 3078 static if (LDC_with_SSE2) 3079 { 3080 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe 3081 { 3082 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); 3083 } 3084 } 3085 else static if (GDC_with_SSE2) 3086 { 3087 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe 3088 { 3089 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); 3090 } 3091 } 3092 else static if (DMD_with_32bit_asm) 3093 { 3094 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe 3095 { 3096 asm pure nothrow @nogc @trusted 3097 { 3098 movdqu XMM0, a; 3099 movdqu XMM1, count; 3100 psllq XMM0, XMM1; 3101 movdqu a, XMM0; 3102 } 3103 return a; 3104 } 3105 } 3106 else 3107 { 3108 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe 3109 { 3110 // ARM: good since LDC 1.12 -O2 3111 // ~but -O0 version is catastrophic 3112 long2 r = void; 3113 long2 sa = cast(long2)a; 3114 long2 lc = cast(long2)count; 3115 int bits = cast(int)(lc.array[0]); 3116 foreach(i; 0..2) 3117 r.array[i] = cast(ulong)(sa.array[i]) << bits; 3118 return cast(__m128i)r; 3119 } 3120 } 3121 3122 static if (LDC_with_SSE2) 3123 { 3124 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted 3125 { 3126 return cast(__m128i) _mm_sll_epi16(cast(short8)a, count); 3127 } 3128 } 3129 else static if (GDC_with_SSE2) 3130 { 3131 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted 3132 { 3133 return cast(__m128i) _mm_sll_epi16(cast(short8)a, count); 3134 } 3135 } 3136 else static if (DMD_with_32bit_asm) 3137 { 3138 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted 3139 { 3140 asm pure nothrow @nogc 3141 { 3142 movdqu XMM0, a; 3143 movdqu XMM1, count; 3144 psllw XMM0, XMM1; 3145 movdqu a, XMM0; 3146 } 3147 return a; 3148 } 3149 } 3150 else 3151 { 3152 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted 3153 { 3154 short8 sa = cast(short8)a; 3155 long2 lc = cast(long2)count; 3156 int bits = cast(int)(lc.array[0]); 3157 short8 r = void; 3158 foreach(i; 0..8) 3159 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits); 3160 return cast(int4)r; 3161 } 3162 } 3163 3164 static if (GDC_with_SSE2) 3165 { 3166 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. 3167 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe 3168 { 3169 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); 3170 } 3171 } 3172 else static if (LDC_with_SSE2) 3173 { 3174 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. 3175 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe 3176 { 3177 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); 3178 } 3179 } 3180 else 3181 { 3182 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. 3183 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe 3184 { 3185 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3186 // D says "It's illegal to shift by the same or more bits 3187 // than the size of the quantity being shifted" 3188 // and it's UB instead. 3189 int4 r = _mm_setzero_si128(); 3190 3191 ubyte count = cast(ubyte) imm8; 3192 if (count > 31) 3193 return r; 3194 3195 foreach(i; 0..4) 3196 r.array[i] = cast(uint)(a.array[i]) << count; 3197 return r; 3198 } 3199 } 3200 unittest 3201 { 3202 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3203 __m128i B = _mm_slli_epi32(A, 1); 3204 __m128i B2 = _mm_slli_epi32(A, 1 + 256); 3205 int[4] expectedB = [ 0, 4, 6, -8]; 3206 assert(B.array == expectedB); 3207 assert(B2.array == expectedB); 3208 3209 __m128i C = _mm_slli_epi32(A, 0); 3210 int[4] expectedC = [ 0, 2, 3, -4]; 3211 assert(C.array == expectedC); 3212 3213 __m128i D = _mm_slli_epi32(A, 65); 3214 int[4] expectedD = [ 0, 0, 0, 0]; 3215 assert(D.array == expectedD); 3216 } 3217 3218 static if (GDC_with_SSE2) 3219 { 3220 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 3221 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe 3222 { 3223 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3224 } 3225 } 3226 else static if (LDC_with_SSE2) 3227 { 3228 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 3229 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe 3230 { 3231 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3232 } 3233 } 3234 else 3235 { 3236 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 3237 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted 3238 { 3239 long2 sa = cast(long2)a; 3240 3241 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3242 // D says "It's illegal to shift by the same or more bits 3243 // than the size of the quantity being shifted" 3244 // and it's UB instead. 3245 long2 r = cast(long2) _mm_setzero_si128(); 3246 ubyte count = cast(ubyte) imm8; 3247 if (count > 63) 3248 return cast(__m128i)r; 3249 3250 r.ptr[0] = cast(ulong)(sa.array[0]) << count; 3251 r.ptr[1] = cast(ulong)(sa.array[1]) << count; 3252 return cast(__m128i)r; 3253 } 3254 } 3255 unittest 3256 { 3257 __m128i A = _mm_setr_epi64(8, -4); 3258 long2 B = cast(long2) _mm_slli_epi64(A, 1); 3259 long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024); 3260 long[2] expectedB = [ 16, -8]; 3261 assert(B.array == expectedB); 3262 assert(B2.array == expectedB); 3263 3264 long2 C = cast(long2) _mm_slli_epi64(A, 0); 3265 long[2] expectedC = [ 8, -4]; 3266 assert(C.array == expectedC); 3267 3268 long2 D = cast(long2) _mm_slli_epi64(A, 64); 3269 long[2] expectedD = [ 0, -0]; 3270 assert(D.array == expectedD); 3271 } 3272 3273 static if (GDC_with_SSE2) 3274 { 3275 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 3276 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted 3277 { 3278 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3279 } 3280 } 3281 else static if (LDC_with_SSE2) 3282 { 3283 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 3284 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted 3285 { 3286 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3287 } 3288 } 3289 else static if (LDC_with_ARM64) 3290 { 3291 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 3292 __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @trusted 3293 { 3294 short8 sa = cast(short8)a; 3295 short8 r = cast(short8)_mm_setzero_si128(); 3296 ubyte count = cast(ubyte) imm8; 3297 if (count > 15) 3298 return cast(__m128i)r; 3299 r = sa << short8(count); 3300 return cast(__m128i)r; 3301 } 3302 } 3303 else 3304 { 3305 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 3306 __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @trusted 3307 { 3308 short8 sa = cast(short8)a; 3309 short8 r = cast(short8)_mm_setzero_si128(); 3310 ubyte count = cast(ubyte) imm8; 3311 if (count > 15) 3312 return cast(__m128i)r; 3313 foreach(i; 0..8) 3314 r.ptr[i] = cast(short)(sa.array[i] << count); 3315 return cast(__m128i)r; 3316 } 3317 } 3318 unittest 3319 { 3320 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3321 short8 B = cast(short8)( _mm_slli_epi16(A, 1) ); 3322 short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) ); 3323 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 3324 assert(B.array == expectedB); 3325 assert(B2.array == expectedB); 3326 3327 short8 C = cast(short8)( _mm_slli_epi16(A, 16) ); 3328 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 3329 assert(C.array == expectedC); 3330 } 3331 3332 3333 /// Shift `a` left by `bytes` bytes while shifting in zeros. 3334 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted 3335 { 3336 static if (bytes & 0xF0) 3337 { 3338 return _mm_setzero_si128(); 3339 } 3340 else 3341 { 3342 static if (GDC_with_SSE2) 3343 { 3344 return __builtin_ia32_pslldqi128(op, cast(ubyte)(bytes * 8)); 3345 } 3346 else version(DigitalMars) 3347 { 3348 version(D_InlineAsm_X86) 3349 { 3350 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64 3351 { 3352 movdqu XMM0, op; 3353 pslldq XMM0, bytes; 3354 movdqu op, XMM0; 3355 } 3356 return op; 3357 } 3358 else 3359 { 3360 byte16 A = cast(byte16)op; 3361 byte16 R; 3362 for (int n = 15; n >= bytes; --n) 3363 R.ptr[n] = A.array[n-bytes]; 3364 for (int n = bytes-1; n >= 0; --n) 3365 R.ptr[n] = 0; 3366 return cast(__m128i)R; 3367 } 3368 } 3369 else 3370 { 3371 return cast(__m128i) shufflevector!(byte16, 3372 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes, 3373 22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes, 3374 28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes) 3375 (cast(byte16)_mm_setzero_si128(), cast(byte16)op); 3376 } 3377 } 3378 } 3379 unittest 3380 { 3381 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3382 short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left 3383 short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ]; 3384 assert(R.array == correct); 3385 3386 __m128i B = _mm_srli_si128!16(_mm_set1_epi32(-1)); 3387 int[4] expectedB = [0, 0, 0, 0]; 3388 assert(B.array == expectedB); 3389 } 3390 3391 version(LDC) 3392 { 3393 // Disappeared with LDC 1.11 3394 static if (__VERSION__ < 2081) 3395 alias _mm_sqrt_pd = __builtin_ia32_sqrtpd; 3396 else 3397 { 3398 __m128d _mm_sqrt_pd(__m128d vec) pure @safe 3399 { 3400 vec.array[0] = llvm_sqrt(vec.array[0]); 3401 vec.array[1] = llvm_sqrt(vec.array[1]); 3402 return vec; 3403 } 3404 } 3405 } 3406 else 3407 { 3408 static if (GDC_with_SSE2) 3409 { 3410 alias _mm_sqrt_pd = __builtin_ia32_sqrtpd; 3411 } 3412 else 3413 { 3414 __m128d _mm_sqrt_pd(__m128d vec) pure @safe 3415 { 3416 vec.array[0] = sqrt(vec.array[0]); 3417 vec.array[1] = sqrt(vec.array[1]); 3418 return vec; 3419 } 3420 } 3421 } 3422 3423 3424 version(LDC) 3425 { 3426 // Disappeared with LDC 1.11 3427 static if (__VERSION__ < 2081) 3428 alias _mm_sqrt_sd = __builtin_ia32_sqrtsd; 3429 else 3430 { 3431 __m128d _mm_sqrt_sd(__m128d vec) pure @safe 3432 { 3433 vec.array[0] = llvm_sqrt(vec.array[0]); 3434 vec.array[1] = vec.array[1]; 3435 return vec; 3436 } 3437 } 3438 } 3439 else 3440 { 3441 static if (GDC_with_SSE2) 3442 { 3443 alias _mm_sqrt_sd = __builtin_ia32_sqrtsd; 3444 } 3445 else 3446 { 3447 __m128d _mm_sqrt_sd(__m128d vec) pure @safe 3448 { 3449 vec.array[0] = sqrt(vec.array[0]); 3450 vec.array[1] = vec.array[1]; 3451 return vec; 3452 } 3453 } 3454 } 3455 3456 3457 static if (GDC_with_SSE2) 3458 { 3459 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe 3460 { 3461 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3462 } 3463 } 3464 else static if (LDC_with_SSE2) 3465 { 3466 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe 3467 { 3468 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3469 } 3470 } 3471 else 3472 { 3473 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe 3474 { 3475 short8 sa = cast(short8)a; 3476 long2 lc = cast(long2)count; 3477 int bits = cast(int)(lc.array[0]); 3478 short8 r = void; 3479 foreach(i; 0..8) 3480 r.array[i] = cast(short)(sa.array[i] >> bits); 3481 return cast(int4)r; 3482 } 3483 } 3484 3485 static if (LDC_with_SSE2) 3486 { 3487 deprecated("Use _mm_srai_epi32 instead.") alias _mm_sra_epi32 = __builtin_ia32_psrad128; 3488 } 3489 else static if (GDC_with_SSE2) 3490 { 3491 deprecated("Use _mm_srai_epi32 instead.") alias _mm_sra_epi32 = __builtin_ia32_psrad128; 3492 } 3493 else 3494 { 3495 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @safe 3496 { 3497 int4 r = void; 3498 long2 lc = cast(long2)count; 3499 int bits = cast(int)(lc.array[0]); 3500 r.array[0] = (a.array[0] >> bits); 3501 r.array[1] = (a.array[1] >> bits); 3502 r.array[2] = (a.array[2] >> bits); 3503 r.array[3] = (a.array[3] >> bits); 3504 return r; 3505 } 3506 } 3507 3508 3509 static if (GDC_with_SSE2) 3510 { 3511 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 3512 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 3513 { 3514 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3515 } 3516 } 3517 else static if (LDC_with_SSE2) 3518 { 3519 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 3520 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 3521 { 3522 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3523 } 3524 } 3525 else static if (LDC_with_ARM64) 3526 { 3527 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 3528 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 3529 { 3530 short8 sa = cast(short8)a; 3531 ubyte count = cast(ubyte)imm8; 3532 if (count > 15) 3533 count = 15; 3534 short8 r = sa >> short8(count); 3535 return cast(__m128i)r; 3536 } 3537 } 3538 else 3539 { 3540 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 3541 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 3542 { 3543 short8 sa = cast(short8)a; 3544 short8 r = void; 3545 3546 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3547 // D says "It's illegal to shift by the same or more bits 3548 // than the size of the quantity being shifted" 3549 // and it's UB instead. 3550 ubyte count = cast(ubyte)imm8; 3551 if (count > 15) 3552 count = 15; 3553 foreach(i; 0..8) 3554 r.ptr[i] = cast(short)(sa.array[i] >> count); 3555 return cast(int4)r; 3556 } 3557 } 3558 unittest 3559 { 3560 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3561 short8 B = cast(short8)( _mm_srai_epi16(A, 1) ); 3562 short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) ); 3563 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 3564 assert(B.array == expectedB); 3565 assert(B2.array == expectedB); 3566 3567 short8 C = cast(short8)( _mm_srai_epi16(A, 18) ); 3568 short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ]; 3569 assert(C.array == expectedC); 3570 } 3571 3572 static if (LDC_with_SSE2) 3573 { 3574 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 3575 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe 3576 { 3577 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 3578 } 3579 } 3580 else static if (GDC_with_SSE2) 3581 { 3582 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 3583 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe 3584 { 3585 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 3586 } 3587 } 3588 else 3589 { 3590 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 3591 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted 3592 { 3593 int4 r = void; 3594 3595 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3596 // D says "It's illegal to shift by the same or more bits 3597 // than the size of the quantity being shifted" 3598 // and it's UB instead. 3599 ubyte count = cast(ubyte) imm8; 3600 if (count > 31) 3601 count = 31; 3602 3603 r.ptr[0] = (a.array[0] >> count); 3604 r.ptr[1] = (a.array[1] >> count); 3605 r.ptr[2] = (a.array[2] >> count); 3606 r.ptr[3] = (a.array[3] >> count); 3607 return r; 3608 } 3609 } 3610 unittest 3611 { 3612 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3613 __m128i B = _mm_srai_epi32(A, 1); 3614 __m128i B2 = _mm_srai_epi32(A, 1 + 256); 3615 int[4] expectedB = [ 0, 1, 1, -2]; 3616 assert(B.array == expectedB); 3617 assert(B2.array == expectedB); 3618 3619 __m128i C = _mm_srai_epi32(A, 32); 3620 int[4] expectedC = [ 0, 0, 0, -1]; 3621 assert(C.array == expectedC); 3622 3623 __m128i D = _mm_srai_epi32(A, 0); 3624 int[4] expectedD = [ 0, 2, 3, -4]; 3625 assert(D.array == expectedD); 3626 } 3627 3628 static if (LDC_with_SSE2) 3629 { 3630 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe 3631 { 3632 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 3633 } 3634 } 3635 else static if (GDC_with_SSE2) 3636 { 3637 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe 3638 { 3639 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 3640 } 3641 } 3642 else 3643 { 3644 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe 3645 { 3646 short8 sa = cast(short8)a; 3647 long2 lc = cast(long2)count; 3648 int bits = cast(int)(lc.array[0]); 3649 short8 r = void; 3650 foreach(i; 0..8) 3651 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits); 3652 return cast(int4)r; 3653 } 3654 } 3655 3656 static if (LDC_with_SSE2) 3657 { 3658 deprecated("Use _mm_srli_epi32 instead.") alias _mm_srl_epi32 = __builtin_ia32_psrld128; 3659 } 3660 else static if (GDC_with_SSE2) 3661 { 3662 deprecated("Use _mm_srli_epi32 instead.") alias _mm_srl_epi32 = __builtin_ia32_psrld128; 3663 } 3664 else 3665 { 3666 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @safe 3667 { 3668 int4 r = void; 3669 long2 lc = cast(long2)count; 3670 int bits = cast(int)(lc.array[0]); 3671 r.array[0] = cast(uint)(a.array[0]) >> bits; 3672 r.array[1] = cast(uint)(a.array[1]) >> bits; 3673 r.array[2] = cast(uint)(a.array[2]) >> bits; 3674 r.array[3] = cast(uint)(a.array[3]) >> bits; 3675 return r; 3676 } 3677 } 3678 3679 static if (LDC_with_SSE2) 3680 { 3681 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe 3682 { 3683 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 3684 } 3685 } 3686 else static if (GDC_with_SSE2) 3687 { 3688 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe 3689 { 3690 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 3691 } 3692 } 3693 else 3694 { 3695 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe 3696 { 3697 long2 r = void; 3698 long2 sa = cast(long2)a; 3699 long2 lc = cast(long2)count; 3700 int bits = cast(int)(lc.array[0]); 3701 r.array[0] = cast(ulong)(sa.array[0]) >> bits; 3702 r.array[1] = cast(ulong)(sa.array[1]) >> bits; 3703 return cast(__m128i)r; 3704 } 3705 } 3706 3707 3708 static if (GDC_with_SSE2) 3709 { 3710 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 3711 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe 3712 { 3713 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 3714 } 3715 } 3716 else static if (LDC_with_SSE2) 3717 { 3718 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 3719 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe 3720 { 3721 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 3722 } 3723 } 3724 else static if (LDC_with_ARM64) 3725 { 3726 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 3727 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted 3728 { 3729 short8 sa = cast(short8)a; 3730 short8 r = cast(short8) _mm_setzero_si128(); 3731 3732 ubyte count = cast(ubyte)imm8; 3733 if (count >= 16) 3734 return cast(__m128i)r; 3735 3736 r = sa >>> short8(count); // This facility offered with LDC, but not DMD. 3737 return cast(__m128i)r; 3738 } 3739 } 3740 else 3741 { 3742 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 3743 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe 3744 { 3745 short8 sa = cast(short8)a; 3746 ubyte count = cast(ubyte)imm8; 3747 3748 short8 r = cast(short8) _mm_setzero_si128(); 3749 if (count >= 16) 3750 return cast(__m128i)r; 3751 3752 foreach(i; 0..8) 3753 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count); 3754 return cast(__m128i)r; 3755 } 3756 } 3757 unittest 3758 { 3759 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3760 short8 B = cast(short8)( _mm_srli_epi16(A, 1) ); 3761 short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) ); 3762 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 3763 assert(B.array == expectedB); 3764 assert(B2.array == expectedB); 3765 3766 short8 C = cast(short8)( _mm_srli_epi16(A, 16) ); 3767 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0]; 3768 assert(C.array == expectedC); 3769 3770 short8 D = cast(short8)( _mm_srli_epi16(A, 0) ); 3771 short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ]; 3772 assert(D.array == expectedD); 3773 } 3774 3775 3776 static if (GDC_with_SSE2) 3777 { 3778 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 3779 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted 3780 { 3781 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 3782 } 3783 } 3784 else static if (LDC_with_SSE2) 3785 { 3786 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 3787 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted 3788 { 3789 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 3790 } 3791 } 3792 else 3793 { 3794 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 3795 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted 3796 { 3797 ubyte count = cast(ubyte) imm8; 3798 3799 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3800 // D says "It's illegal to shift by the same or more bits 3801 // than the size of the quantity being shifted" 3802 // and it's UB instead. 3803 int4 r = _mm_setzero_si128(); 3804 if (count >= 32) 3805 return r; 3806 r.ptr[0] = a.array[0] >>> count; 3807 r.ptr[1] = a.array[1] >>> count; 3808 r.ptr[2] = a.array[2] >>> count; 3809 r.ptr[3] = a.array[3] >>> count; 3810 return r; 3811 } 3812 } 3813 unittest 3814 { 3815 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3816 __m128i B = _mm_srli_epi32(A, 1); 3817 __m128i B2 = _mm_srli_epi32(A, 1 + 256); 3818 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; 3819 assert(B.array == expectedB); 3820 assert(B2.array == expectedB); 3821 3822 __m128i C = _mm_srli_epi32(A, 255); 3823 int[4] expectedC = [ 0, 0, 0, 0 ]; 3824 assert(C.array == expectedC); 3825 } 3826 3827 static if (GDC_with_SSE2) 3828 { 3829 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 3830 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted 3831 { 3832 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 3833 } 3834 } 3835 else static if (LDC_with_SSE2) 3836 { 3837 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 3838 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted 3839 { 3840 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 3841 } 3842 } 3843 else 3844 { 3845 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 3846 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted 3847 { 3848 long2 r = cast(long2) _mm_setzero_si128(); 3849 long2 sa = cast(long2)a; 3850 3851 ubyte count = cast(ubyte) imm8; 3852 if (count >= 64) 3853 return cast(__m128i)r; 3854 3855 r.ptr[0] = sa.array[0] >>> count; 3856 r.ptr[1] = sa.array[1] >>> count; 3857 return cast(__m128i)r; 3858 } 3859 } 3860 unittest 3861 { 3862 __m128i A = _mm_setr_epi64(8, -4); 3863 long2 B = cast(long2) _mm_srli_epi64(A, 1); 3864 long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512); 3865 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; 3866 assert(B.array == expectedB); 3867 assert(B2.array == expectedB); 3868 3869 long2 C = cast(long2) _mm_srli_epi64(A, 64); 3870 long[2] expectedC = [ 0, 0 ]; 3871 assert(C.array == expectedC); 3872 } 3873 3874 /// Shift `v` right by `bytes` bytes while shifting in zeros. 3875 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe 3876 { 3877 static if (bytes & 0xF0) 3878 { 3879 return _mm_setzero_si128(); 3880 } 3881 else 3882 { 3883 static if (GDC_with_SSE2) 3884 { 3885 return cast(__m128i) __builtin_ia32_psrldqi128(v, cast(ubyte)(bytes * 8)); 3886 } 3887 else static if (DMD_with_32bit_asm) 3888 { 3889 asm pure nothrow @nogc @trusted 3890 { 3891 movdqu XMM0, v; 3892 psrldq XMM0, bytes; 3893 movdqu v, XMM0; 3894 } 3895 return v; 3896 } 3897 else 3898 { 3899 return cast(__m128i) shufflevector!(byte16, 3900 bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7, 3901 bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15) 3902 (cast(byte16) v, cast(byte16)_mm_setzero_si128()); 3903 } 3904 } 3905 3906 } 3907 3908 unittest 3909 { 3910 __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1)); 3911 int[4] correct = [2, 3, 4, 0]; 3912 assert(R.array == correct); 3913 3914 __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1)); 3915 int[4] expectedA = [0, 0, 0, 0]; 3916 assert(A.array == expectedA); 3917 } 3918 3919 /// Shift `v` right by `bytes` bytes while shifting in zeros. 3920 /// #BONUS 3921 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe 3922 { 3923 return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v); 3924 } 3925 unittest 3926 { 3927 __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)); 3928 float[4] correct = [3.0f, 4.0f, 0, 0]; 3929 assert(R.array == correct); 3930 } 3931 3932 /// Shift `v` right by `bytes` bytes while shifting in zeros. 3933 /// #BONUS 3934 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe 3935 { 3936 return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v); 3937 } 3938 3939 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted 3940 { 3941 __m128d* aligned = cast(__m128d*)mem_addr; 3942 *aligned = a; 3943 } 3944 3945 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted 3946 { 3947 __m128d* aligned = cast(__m128d*)mem_addr; 3948 __m128d r; 3949 r.ptr[0] = a.array[0]; 3950 r.ptr[1] = a.array[0]; 3951 *aligned = r; 3952 } 3953 3954 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe 3955 { 3956 *mem_addr = a.array[0]; 3957 } 3958 3959 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe 3960 { 3961 *mem_addr = a; 3962 } 3963 3964 alias _mm_store1_pd = _mm_store_pd1; 3965 3966 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe 3967 { 3968 *mem_addr = a.array[1]; 3969 } 3970 3971 // Note: `mem_addr` doesn't have to actually be aligned, which breaks 3972 // expectations from the user point of view. This problem also exist in C++. 3973 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe 3974 { 3975 long* dest = cast(long*)mem_addr; 3976 long2 la = cast(long2)a; 3977 *dest = la.array[0]; 3978 } 3979 unittest 3980 { 3981 long[3] A = [1, 2, 3]; 3982 _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000)); 3983 long[3] correct = [1, 0x1_0000_0000, 3]; 3984 assert(A == correct); 3985 } 3986 3987 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe 3988 { 3989 *mem_addr = a.array[0]; 3990 } 3991 3992 void _mm_storer_pd (double* mem_addr, __m128d a) pure 3993 { 3994 __m128d* aligned = cast(__m128d*)mem_addr; 3995 *aligned = shufflevector!(double2, 1, 0)(a, a); 3996 } 3997 3998 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe 3999 { 4000 storeUnaligned!double2(a, mem_addr); 4001 } 4002 4003 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe 4004 { 4005 storeUnaligned!__m128i(a, cast(int*)mem_addr); 4006 } 4007 4008 /// Store 32-bit integer from the first element of `a` into memory. 4009 /// `mem_addr` does not need to be aligned on any particular boundary. 4010 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted 4011 { 4012 int* dest = cast(int*)mem_addr; 4013 *dest = a.array[0]; 4014 } 4015 unittest 4016 { 4017 int[2] arr = [-24, 12]; 4018 _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7)); 4019 assert(arr == [-24, -1]); 4020 } 4021 4022 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) 4023 /// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte 4024 /// boundary or a general-protection exception may be generated. 4025 void _mm_stream_pd (double* mem_addr, __m128d a) 4026 { 4027 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4028 __m128d* dest = cast(__m128d*)mem_addr; 4029 *dest = a; 4030 } 4031 4032 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint. 4033 /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception 4034 /// may be generated. 4035 void _mm_stream_si128 (__m128i* mem_addr, __m128i a) 4036 { 4037 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4038 __m128i* dest = cast(__m128i*)mem_addr; 4039 *dest = a; 4040 } 4041 4042 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache 4043 /// pollution. If the cache line containing address mem_addr is already in the cache, 4044 /// the cache will be updated. 4045 void _mm_stream_si32 (int* mem_addr, int a) 4046 { 4047 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4048 *mem_addr = a; 4049 } 4050 4051 /// Store 64-bit integer a into memory using a non-temporal hint to minimize 4052 /// cache pollution. If the cache line containing address mem_addr is already 4053 /// in the cache, the cache will be updated. 4054 void _mm_stream_si64 (long* mem_addr, long a) 4055 { 4056 // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4057 *mem_addr = a; 4058 } 4059 4060 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe 4061 { 4062 return cast(__m128i)(cast(short8)a - cast(short8)b); 4063 } 4064 4065 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe 4066 { 4067 return cast(__m128i)(cast(int4)a - cast(int4)b); 4068 } 4069 4070 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe 4071 { 4072 return cast(__m128i)(cast(long2)a - cast(long2)b); 4073 } 4074 4075 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe 4076 { 4077 return cast(__m128i)(cast(byte16)a - cast(byte16)b); 4078 } 4079 4080 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe 4081 { 4082 return a - b; 4083 } 4084 4085 version(DigitalMars) 4086 { 4087 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 4088 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe 4089 { 4090 asm pure nothrow @nogc @trusted { nop;} 4091 a[0] = a[0] - b[0]; 4092 return a; 4093 } 4094 } 4095 else static if (GDC_with_SSE2) 4096 { 4097 alias _mm_sub_sd = __builtin_ia32_subsd; 4098 } 4099 else 4100 { 4101 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe 4102 { 4103 a.array[0] -= b.array[0]; 4104 return a; 4105 } 4106 } 4107 unittest 4108 { 4109 __m128d a = [1.5, -2.0]; 4110 a = _mm_sub_sd(a, a); 4111 assert(a.array == [0.0, -2.0]); 4112 } 4113 4114 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe 4115 { 4116 return a - b; 4117 } 4118 4119 version(LDC) 4120 { 4121 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 4122 { 4123 // Generates PSUBSW since LDC 1.15 -O0 4124 /// Add packed 16-bit signed integers in `a` and `b` using signed saturation. 4125 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted 4126 { 4127 enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 4128 enum ir = ` 4129 %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 4130 ret <8 x i16> %r`; 4131 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 4132 } 4133 } 4134 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 4135 { 4136 /// Add packed 16-bit signed integers in `a` and `b` using signed saturation. 4137 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted 4138 { 4139 short[8] res; 4140 short8 sa = cast(short8)a; 4141 short8 sb = cast(short8)b; 4142 foreach(i; 0..8) 4143 res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]); 4144 return _mm_loadu_si128(cast(int4*)res.ptr); 4145 } 4146 } 4147 else 4148 alias _mm_subs_epi16 = __builtin_ia32_psubsw128; 4149 } 4150 else 4151 { 4152 static if (GDC_with_SSE2) 4153 { 4154 alias _mm_subs_epi16 = __builtin_ia32_psubsw128; 4155 } 4156 else 4157 { 4158 /// Add packed 16-bit signed integers in `a` and `b` using signed saturation. 4159 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted 4160 { 4161 short[8] res; 4162 short8 sa = cast(short8)a; 4163 short8 sb = cast(short8)b; 4164 foreach(i; 0..8) 4165 res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]); 4166 return _mm_loadu_si128(cast(int4*)res.ptr); 4167 } 4168 } 4169 } 4170 unittest 4171 { 4172 short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0), 4173 _mm_setr_epi16(-10 , 16, 5, 4, 3, 2, 1, 0)); 4174 static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0]; 4175 assert(res.array == correctResult); 4176 } 4177 4178 version(LDC) 4179 { 4180 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 4181 { 4182 // x86: Generates PSUBSB since LDC 1.15 -O0 4183 // ARM: Generates sqsub.16b since LDC 1.21 -O0 4184 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation. 4185 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted 4186 { 4187 enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 4188 enum ir = ` 4189 %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 4190 ret <16 x i8> %r`; 4191 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 4192 } 4193 } 4194 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 4195 { 4196 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation. 4197 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted 4198 { 4199 byte[16] res; 4200 byte16 sa = cast(byte16)a; 4201 byte16 sb = cast(byte16)b; 4202 foreach(i; 0..16) 4203 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]); 4204 return _mm_loadu_si128(cast(int4*)res.ptr); 4205 } 4206 } 4207 else 4208 alias _mm_subs_epi8 = __builtin_ia32_psubsb128; 4209 } 4210 else 4211 { 4212 static if (GDC_with_SSE2) 4213 { 4214 alias _mm_subs_epi8 = __builtin_ia32_psubsb128; 4215 } 4216 else 4217 { 4218 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation. 4219 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted 4220 { 4221 byte[16] res; 4222 byte16 sa = cast(byte16)a; 4223 byte16 sb = cast(byte16)b; 4224 foreach(i; 0..16) 4225 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]); 4226 return _mm_loadu_si128(cast(int4*)res.ptr); 4227 } 4228 } 4229 } 4230 unittest 4231 { 4232 byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 4233 _mm_setr_epi8( 15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 4234 static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 4235 assert(res.array == correctResult); 4236 } 4237 4238 version(LDC) 4239 { 4240 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 4241 { 4242 // x86: Generates PSUBUSW since LDC 1.15 -O0 4243 // ARM: Generates uqsub.8h since LDC 1.21 -O0 4244 /// Add packed 16-bit unsigned integers in `a` and `b` using unsigned saturation. 4245 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted 4246 { 4247 enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 4248 enum ir = ` 4249 %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 4250 ret <8 x i16> %r`; 4251 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 4252 } 4253 } 4254 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 4255 { 4256 /// Add packed 16-bit unsigned integers in `a` and `b` using unsigned saturation. 4257 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted 4258 { 4259 short[8] res; 4260 short8 sa = cast(short8)a; 4261 short8 sb = cast(short8)b; 4262 foreach(i; 0..8) 4263 { 4264 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]); 4265 res[i] = saturateSignedIntToUnsignedShort(sum); 4266 } 4267 return _mm_loadu_si128(cast(int4*)res.ptr); 4268 } 4269 } 4270 else 4271 alias _mm_subs_epu16 = __builtin_ia32_psubusw128; 4272 } 4273 else 4274 { 4275 static if (GDC_with_SSE2) 4276 { 4277 alias _mm_subs_epu16 = __builtin_ia32_psubusw128; 4278 } 4279 else 4280 { 4281 /// Add packed 16-bit unsigned integers in `a` and `b` using unsigned saturation. 4282 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted 4283 { 4284 short[8] res; 4285 short8 sa = cast(short8)a; 4286 short8 sb = cast(short8)b; 4287 foreach(i; 0..8) 4288 { 4289 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]); 4290 res[i] = saturateSignedIntToUnsignedShort(sum); 4291 } 4292 return _mm_loadu_si128(cast(int4*)res.ptr); 4293 } 4294 } 4295 } 4296 unittest 4297 { 4298 short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534, 1, 5, 4, 3, 2, 1, 0), 4299 _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0)); 4300 static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0]; 4301 assert(R.array == correct); 4302 } 4303 4304 version(LDC) 4305 { 4306 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 4307 { 4308 // x86: Generates PSUBUSB since LDC 1.15 -O0 4309 // ARM: Generates uqsub.16b since LDC 1.21 -O0 4310 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 4311 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted 4312 { 4313 enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 4314 enum ir = ` 4315 %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 4316 ret <16 x i8> %r`; 4317 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 4318 } 4319 } 4320 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 4321 { 4322 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 4323 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted 4324 { 4325 ubyte[16] res; 4326 byte16 sa = cast(byte16)a; 4327 byte16 sb = cast(byte16)b; 4328 foreach(i; 0..16) 4329 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i])); 4330 return _mm_loadu_si128(cast(int4*)res.ptr); 4331 } 4332 } 4333 else 4334 alias _mm_subs_epu8 = __builtin_ia32_psubusb128; 4335 } 4336 else 4337 { 4338 static if (GDC_with_SSE2) 4339 { 4340 alias _mm_subs_epu8 = __builtin_ia32_psubusb128; 4341 } 4342 else 4343 { 4344 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 4345 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted 4346 { 4347 ubyte[16] res; 4348 byte16 sa = cast(byte16)a; 4349 byte16 sb = cast(byte16)b; 4350 foreach(i; 0..16) 4351 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i])); 4352 return _mm_loadu_si128(cast(int4*)res.ptr); 4353 } 4354 } 4355 } 4356 unittest 4357 { 4358 byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 4359 _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 4360 static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 4361 assert(res.array == correctResult); 4362 } 4363 4364 // Note: the only difference between these intrinsics is the signalling 4365 // behaviour of quiet NaNs. This is incorrect but the case where 4366 // you would want to differentiate between qNaN and sNaN and then 4367 // treat them differently on purpose seems extremely rare. 4368 alias _mm_ucomieq_sd = _mm_comieq_sd; 4369 alias _mm_ucomige_sd = _mm_comige_sd; 4370 alias _mm_ucomigt_sd = _mm_comigt_sd; 4371 alias _mm_ucomile_sd = _mm_comile_sd; 4372 alias _mm_ucomilt_sd = _mm_comilt_sd; 4373 alias _mm_ucomineq_sd = _mm_comineq_sd; 4374 4375 __m128d _mm_undefined_pd() pure @safe 4376 { 4377 __m128d result = void; 4378 return result; 4379 } 4380 __m128i _mm_undefined_si128() pure @safe 4381 { 4382 __m128i result = void; 4383 return result; 4384 } 4385 4386 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe 4387 { 4388 static if (GDC_with_SSE2) 4389 { 4390 return __builtin_ia32_punpckhwd128(a, b); 4391 } 4392 else static if (DMD_with_32bit_asm) 4393 { 4394 asm pure nothrow @nogc @trusted 4395 { 4396 movdqu XMM0, a; 4397 movdqu XMM1, b; 4398 punpckhwd XMM0, XMM1; 4399 movdqu a, XMM0; 4400 } 4401 return a; 4402 } 4403 else 4404 { 4405 return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15) 4406 (cast(short8)a, cast(short8)b); 4407 } 4408 } 4409 unittest 4410 { 4411 __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11); 4412 __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19); 4413 short8 C = cast(short8)(_mm_unpackhi_epi16(A, B)); 4414 short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19]; 4415 assert(C.array == correct); 4416 } 4417 4418 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe 4419 { 4420 static if (GDC_with_SSE2) 4421 { 4422 return __builtin_ia32_punpckhdq128(a, b); 4423 } 4424 else 4425 { 4426 return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b); 4427 } 4428 } 4429 4430 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted 4431 { 4432 static if (GDC_with_SSE2) 4433 { 4434 return __builtin_ia32_punpckhqdq128(a, b); 4435 } 4436 else 4437 { 4438 __m128i r = cast(__m128i)b; 4439 r[0] = a[2]; 4440 r[1] = a[3]; 4441 return r; 4442 } 4443 } 4444 unittest // Issue #36 4445 { 4446 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4447 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4448 long2 C = cast(long2)(_mm_unpackhi_epi64(A, B)); 4449 long[2] correct = [0x33333333_33333333, 0x55555555_55555555]; 4450 assert(C.array == correct); 4451 } 4452 4453 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe 4454 { 4455 static if (GDC_with_SSE2) 4456 { 4457 return __builtin_ia32_punpckhbw128(a, b); 4458 } 4459 else static if (DMD_with_32bit_asm) 4460 { 4461 asm pure nothrow @nogc @trusted 4462 { 4463 movdqu XMM0, a; 4464 movdqu XMM1, b; 4465 punpckhbw XMM0, XMM1; 4466 movdqu a, XMM0; 4467 } 4468 return a; 4469 } 4470 else 4471 { 4472 return cast(__m128i)shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27, 4473 12, 28, 13, 29, 14, 30, 15, 31) 4474 (cast(byte16)a, cast(byte16)b); 4475 } 4476 } 4477 4478 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe 4479 { 4480 static if (GDC_with_SSE2) 4481 { 4482 return __builtin_ia32_unpckhpd(a, b); 4483 } 4484 else 4485 { 4486 return shufflevector!(__m128d, 1, 3)(a, b); 4487 } 4488 } 4489 4490 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe 4491 { 4492 static if (GDC_with_SSE2) 4493 { 4494 return __builtin_ia32_punpcklwd128(a, b); 4495 } 4496 else static if (DMD_with_32bit_asm) 4497 { 4498 asm pure nothrow @nogc @trusted 4499 { 4500 movdqu XMM0, a; 4501 movdqu XMM1, b; 4502 punpcklwd XMM0, XMM1; 4503 movdqu a, XMM0; 4504 } 4505 return a; 4506 } 4507 else 4508 { 4509 return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11) 4510 (cast(short8)a, cast(short8)b); 4511 } 4512 } 4513 4514 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe 4515 { 4516 static if (GDC_with_SSE2) 4517 { 4518 return __builtin_ia32_punpckldq128(a, b); 4519 } 4520 else 4521 { 4522 return shufflevector!(int4, 0, 4, 1, 5) 4523 (cast(int4)a, cast(int4)b); 4524 } 4525 } 4526 4527 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted 4528 { 4529 static if (GDC_with_SSE2) 4530 { 4531 return __builtin_ia32_punpcklqdq128(a, b); 4532 } 4533 else 4534 { 4535 long2 lA = cast(long2)a; 4536 long2 lB = cast(long2)b; 4537 long2 R; 4538 R.ptr[0] = lA.array[0]; 4539 R.ptr[1] = lB.array[0]; 4540 return cast(__m128i)R; 4541 } 4542 } 4543 unittest // Issue #36 4544 { 4545 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4546 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4547 long2 C = cast(long2)(_mm_unpacklo_epi64(A, B)); 4548 long[2] correct = [0x22222222_22222222, 0x44444444_44444444]; 4549 assert(C.array == correct); 4550 } 4551 4552 4553 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe 4554 { 4555 static if (GDC_with_SSE2) 4556 { 4557 return __builtin_ia32_punpcklbw128(a, b); 4558 } 4559 else static if (DMD_with_32bit_asm) 4560 { 4561 asm pure nothrow @nogc @trusted 4562 { 4563 movdqu XMM0, a; 4564 movdqu XMM1, b; 4565 punpcklbw XMM0, XMM1; 4566 movdqu a, XMM0; 4567 } 4568 return a; 4569 } 4570 else 4571 { 4572 return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19, 4573 4, 20, 5, 21, 6, 22, 7, 23) 4574 (cast(byte16)a, cast(byte16)b); 4575 } 4576 } 4577 4578 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe 4579 { 4580 static if (GDC_with_SSE2) 4581 { 4582 return __builtin_ia32_unpcklpd(a, b); 4583 } 4584 else 4585 { 4586 return shufflevector!(__m128d, 0, 2)(a, b); 4587 } 4588 } 4589 4590 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe 4591 { 4592 return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b); 4593 } 4594 4595 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe 4596 { 4597 return a ^ b; 4598 } 4599 4600 unittest 4601 { 4602 // distance between two points in 4D 4603 float distance(float[4] a, float[4] b) nothrow @nogc 4604 { 4605 __m128 va = _mm_loadu_ps(a.ptr); 4606 __m128 vb = _mm_loadu_ps(b.ptr); 4607 __m128 diffSquared = _mm_sub_ps(va, vb); 4608 diffSquared = _mm_mul_ps(diffSquared, diffSquared); 4609 __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared)); 4610 sum = _mm_add_ps(sum, _mm_srli_ps!4(sum)); 4611 return _mm_cvtss_f32(_mm_sqrt_ss(sum)); 4612 } 4613 assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2); 4614 }