/**
* Copyright: Copyright Auburn Sounds 2016-2019, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1
import inteli.mmx;
import inteli.internals;

nothrow @nogc:


// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a + cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 R = cast(short8) _mm_add_epi16(A, A);
    short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a + cast(int4)b);
}
unittest
{
    __m128i A = _mm_setr_epi32(-7, -1, 0, 9);
    int4 R = _mm_add_epi32(A, A);
    int[4] correct = [-14, -2, 0, 18];
    assert(R.array == correct);
}

/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a + cast(long2)b);
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 R = cast(long2) _mm_add_epi64(A, A);
    long[2] correct = [-2, 0];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a + cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 R = cast(byte16) _mm_add_epi8(A, A);
    byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(R.array == correct);
}

/// Add the lower double-precision (64-bit) floating-point element
/// in `a` and `b`, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        asm pure nothrow @nogc @trusted { nop; }
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}

/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    return a + b;
}
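
// Added example (not in the original source): a minimal check of _mm_add_si64,
// using _mm_setr_pi32 from inteli.mmx, which this module already imports.
unittest
{
    __m64 A = _mm_setr_pi32(1, 0);    // A = 1
    __m64 B = _mm_setr_pi32(-2, -1);  // B = -2 (two's complement across both halves)
    __m64 R = _mm_add_si64(A, B);
    assert(R.array[0] == -1);
}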

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_paddsw128(a, b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSW since LDC 1.15 -O0
            // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddsw128(a, b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_paddsb128(a, b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSB since LDC 1.15 -O0
            // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddsb128(a, b);
    }
    else
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}
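
// Added example (not in the original source): the tests above never saturate,
// so here is a check that the signed saturating paths actually clamp.
unittest
{
    short8 Rs = cast(short8) _mm_adds_epi16(_mm_set1_epi16(short.max), _mm_set1_epi16(1));
    assert(Rs.array[0] == short.max);   // 32767 + 1 saturates to 32767
    byte16 Rb = cast(byte16) _mm_adds_epi8(_mm_set1_epi8(byte.min), _mm_set1_epi8(-1));
    assert(Rb.array[0] == byte.min);    // -128 + (-1) saturates to -128
}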

/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSB since LDC 1.15 -O0
            // ARM: Generates uqadd.16b since LDC 1.21 -O1
            enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusb128(a, b);
    }
    else
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                                            _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14,
                                               0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
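
// Added example (not in the original source): a simpler saturation check,
// away from the reversed-argument _mm_set_epi8 noise above.
unittest
{
    // Unsigned 200 + 100 = 300 clamps to 255.
    byte16 R = cast(byte16) _mm_adds_epu8(_mm_set1_epi8(cast(byte)200), _mm_set1_epi8(100));
    assert(R.array[0] == cast(byte)255);
}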

/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSW since LDC 1.15 -O0
            // ARM: Generates uqadd.8h since LDC 1.21 -O1
            enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            ushort[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusw128(a, b);
    }
    else
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(long2)a & cast(long2)b );
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m128d A = _mm_set_pd(a, b);
    __m128d B = _mm_set_pd(b, a);
    long2 R = cast(long2)( _mm_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    return a & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_and_si128(A, B);
    int[4] correct = [6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of packed double-precision (64-bit)
/// floating-point elements in `a` and then AND with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( ~(cast(long2)a) & cast(long2)b );
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
    long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
    __m128d A = _mm_setr_pd(a, b);
    __m128d B = _mm_setr_pd(b, a);
    long2 R = cast(long2)( _mm_andnot_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct2);
}

/// Compute the bitwise NOT of 128 bits (representing integer data)
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_andnot_si128(A, B);
    int[4] correct = [8, 8, 8, 8];
    assert(R.array == correct);
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
// TODO: #ARM
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pavgw128(a, b);
    }
    else version(LDC)
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48);
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
// TODO: #ARM
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pavgb128(a, b);
    }
    else version(LDC)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48);
}

/// Shift `a` left by `bytes` bytes while shifting in zeros.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}
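
// Added example (not in the original source): the "+ 1" in both the LLVM IR and
// the fallback means these averages round up, matching the PAVGW/PAVGB semantics.
unittest
{
    short8 R = cast(short8) _mm_avg_epu16(_mm_set1_epi16(1), _mm_set1_epi16(2));
    assert(R.array[0] == 2); // (1 + 2 + 1) >> 1 == 2
}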

/// Shift `v` right by `bytes` bytes while shifting in zeros.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Cast vector of type `__m128d` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Cast vector of type `__m128d` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128i` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128i` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}

/// Invalidate and flush the cache line that contains `p`
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else
    {
        // Do nothing. Not invalidating the cache line does
        // not affect correctness.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqw128(a, b);
    }
    else
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8 B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4 A = [-3, -2, -1,  0];
    int4 B = [ 4, -2,  2,  0];
    int[4] E = [ 0, -1,  0, -1];
    int4 R = cast(int4)(_mm_cmpeq_epi32(A, B)); // was calling _mm_cmpeq_epi16, which only passed by accident
    assert(R.array == E);
}
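
// Added example (not in the original source): the _mm_cast* functions are
// bit-pattern-preserving reinterpretations, not value conversions.
unittest
{
    __m128 a = _mm_set1_ps(1.0f);
    __m128 b = _mm_castpd_ps(_mm_castps_pd(a)); // round-trip through __m128d
    assert(a.array == b.array);
}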

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqb128(a, b);
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtw128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!short8(cast(short8)a, cast(short8)b));
    }
}
unittest
{
    short8 A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8 B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b));
    }
}
unittest
{
    int4 A = [-3,  2, -1, 0];
    int4 B = [ 4, -2,  2, 0];
    int[4] E = [ 0, -1,  0, 0];
    int4 R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}
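
// Added example (not in the original source): _mm_cmpeq_pd produces an all-ones
// 64-bit mask per equal lane, and 0 otherwise (a NaN lane compares unequal).
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpeq_pd(A, B);
    assert(R.array[0] == -1);
    assert(R.array[1] == 0);
}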

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtb128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!byte16(cast(byte16)a, cast(byte16)b));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct = [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed 32-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}
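
// Added example (not in the original source): the _mm_cmplt_epi* wrappers are
// just _mm_cmpgt_epi* with swapped operands; a quick check for the 32-bit one.
unittest
{
    int4 A = [-3,  2, -1, 0];
    int4 B = [ 4, -2,  2, 0];
    int[4] E = [-1,  0, -1, 0];
    int4 R = cast(int4) _mm_cmplt_epi32(A, B);
    assert(R.array == E);
}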

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal, store the result in
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}
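
// Added example (not in the original source): "not-equal" maps to the unordered
// UNE predicate above, so a NaN lane compares as not-equal even against itself.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    long2 R = cast(long2) _mm_cmpneq_pd(A, A);
    assert(R.array[0] == 0);  // 1.0 == 1.0
    assert(R.array[1] == -1); // NaN != NaN
}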

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN, store the result in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN, store the result in the lower
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}


// Note: we've reverted clang and GCC behaviour with regards to EFLAGS.
// Some such comparisons yield true for NaNs, others don't.

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comieq(a, b);
    }
    else
    {
        return comsd!(FPComparison.ueq)(a, b); // yields true for NaN, same as GCC
    }
}
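
// Added example (not in the original source): basic sanity check; per the note
// above, _mm_comieq_sd uses UEQ and thus also yields 1 when an input is NaN.
unittest
{
    assert(1 == _mm_comieq_sd(_mm_set1_pd(64.0), _mm_set1_pd(64.0)));
    assert(0 == _mm_comieq_sd(_mm_set1_pd(1.0),  _mm_set1_pd(2.0)));
}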

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than-or-equal, and return the boolean
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comige(a, b);
    }
    else
    {
        return comsd!(FPComparison.oge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comigt(a, b);
    }
    else
    {
        return comsd!(FPComparison.ogt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comile(a, b);
    }
    else
    {
        return comsd!(FPComparison.ule)(a, b); // yields true for NaN, same as GCC
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comilt(a, b);
    }
    else
    {
        return comsd!(FPComparison.ult)(a, b); // yields true for NaN, same as GCC
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_comineq(a, b);
    }
    else
    {
        return comsd!(FPComparison.one)(a, b);
    }
}

/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}
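
// Added example (not in the original source): illustrates the NaN behaviour
// that the comments on _mm_comilt_sd above claim (true for NaN, same as GCC).
unittest
{
    assert(1 == _mm_comilt_sd(_mm_set1_pd(1.0), _mm_set1_pd(2.0)));
    assert(0 == _mm_comilt_sd(_mm_set1_pd(2.0), _mm_set1_pd(1.0)));
    assert(1 == _mm_comilt_sd(_mm_set1_pd(double.nan), _mm_set1_pd(double.nan)));
}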

/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else
    {
        // x86: Generates cvtdq2ps since LDC 1.0.0 -O1
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O2
        __m128 res;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed 32-bit integers.
// TODO #ARM
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    static if (LDC_with_SSE2)
    {
        // Like in clang, implemented with a magic intrinsic right now
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else
    {
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    {
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers.
// TODO #ARM
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        // Disabled, since it fails with optimizations unfortunately
        //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
        return __asm!__m128i("cvtps2dq $1,$0","=x,x",a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else
    {
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}
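
// Added example (not in the original source): trivial check of _mm_cvtsd_f64.
unittest
{
    __m128d A = _mm_setr_pd(4.5, -2.0);
    assert(_mm_cvtsd_f64(A) == 4.5);
}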

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}

// _mm_cvtsd_si64: convert the lower double-precision (64-bit) floating-point
// element in `a` to a 64-bit integer.
version(LDC)
{
    // Unfortunately this builtin crashes in 32-bit
    version(X86_64)
        alias _mm_cvtsd_si64 = __builtin_ia32_cvtsd2si64;
    else
    {
        long _mm_cvtsd_si64 (__m128d a) @safe
        {
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
}
else
{
    long _mm_cvtsd_si64 (__m128d a) @safe
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

alias _mm_cvtsd_si64x = _mm_cvtsd_si64;

/// Convert the lower double-precision (64-bit) floating-point element in `b`
/// to a single-precision (32-bit) floating-point element, store it in the
/// lower element of the result, and copy the upper 3 elements from `a`.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b);
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a[0] = b[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Copy the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}

/// Copy the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la.array[0];
}
alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;

/// Convert the signed 32-bit integer `x` to a double-precision (64-bit)
/// floating-point element, store it in the lower element of the result,
/// and copy the upper element from `v`.
__m128d _mm_cvtsi32_sd(__m128d v, int x) pure @trusted
{
    v.ptr[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy the 32-bit integer `a` to the lower element of the result,
/// and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 r = [0, 0, 0, 0];
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}

/// Convert the signed 64-bit integer `x` to a double-precision (64-bit)
/// floating-point element, store it in the lower element of the result,
/// and copy the upper element from `v`.
// Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy
__m128d _mm_cvtsi64_sd(__m128d v, long x) pure @trusted
{
    v.ptr[0] = cast(double)x;
    return v;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy the 64-bit integer `a` to the lower element of the result,
/// and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}

alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd;
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;

/// Convert the lower single-precision (32-bit) floating-point element in `x`
/// to a double-precision (64-bit) floating-point element, store it in the
/// lower element of the result, and copy the upper element from `v`.
double2 _mm_cvtss_sd(double2 v, float4 x) pure @trusted
{
    v.ptr[0] = x.array[0];
    return v;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}
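
// Added example (not in the original source): round-trip through the
// scalar <-> vector copy intrinsics defined above.
unittest
{
    __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
    assert(_mm_cvtsi128_si32(A) == -1);

    __m128i B = _mm_cvtsi64_si128(0x1_0000_0001);
    assert(_mm_cvtsi128_si64(B) == 0x1_0000_0001);
}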

/// Convert the lower single-precision (32-bit) floating-point element
/// in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

// _mm_cvttpd_epi32: convert packed double-precision (64-bit) floating-point
// elements in `a` to packed 32-bit integers with truncation.
static if (LDC_with_SSE2)
{
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
}
else static if (GDC_with_SSE2)
{
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
}
else
{
    __m128i _mm_cvttpd_epi32 (__m128d a) pure @safe
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r;
        r.array[0] = cast(int)a.array[0];
        r.array[1] = cast(int)a.array[1];
        r.array[2] = 0;
        r.array[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}


/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // Note: Generates cvttps2dq since LDC 1.3 -O2
    __m128i r;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0,
    // but in 32-bit it's a long sequence that resorts to the FPU
    return cast(long)a.array[0];
}

alias _mm_cvttsd_si64x = _mm_cvttsd_si64;

/// Divide packed double-precision (64-bit) floating-point elements
/// in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}

// _mm_div_sd: divide the lower double-precision (64-bit) floating-point element
// in `a` by the lower element in `b`, and copy the upper element from `a`.
static if (GDC_with_SSE2)
{
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
    {
        return __builtin_ia32_divsd(a, b);
    }
}
else version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop; }
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
}
else
{
    __m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
    {
        a.array[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
// PERF: ARM version has an array bounds check
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
}
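
// Added example (not in the original source): truncation ignores the MXCSR
// rounding mode, plus a quick _mm_div_pd check.
unittest
{
    assert(-4 == _mm_cvttsd_si32(_mm_set1_pd(-4.9))); // truncates toward zero
    __m128d Q = _mm_div_pd(_mm_setr_pd(8.0, 9.0), _mm_setr_pd(2.0, 3.0));
    assert(Q.array == [4.0, 3.0]);
}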

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

// _mm_lfence: perform a serializing operation on all load-from-memory
// instructions that were issued prior to this instruction.
version(GNU)
{
    void _mm_lfence() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else static if (LDC_with_SSE2)
{
    alias _mm_lfence = __builtin_ia32_lfence;
}
else static if (DMD_with_asm)
{
    void _mm_lfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
}
else version(LDC)
{
    void _mm_lfence() pure @safe
    {
        llvm_memory_fence(); // Note: actually generates mfence
    }
}
else
    static assert(false);
unittest
{
    _mm_lfence();
}


/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
/// elements) from memory. `mem_addr` must be aligned on a 16-byte boundary.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}

/// Load a double-precision (64-bit) floating-point element from memory
/// into both elements of the result.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] arr = [*mem_addr, *mem_addr];
    return loadUnaligned!(double2)(&arr[0]);
}

/// Load a double-precision (64-bit) floating-point element from memory
/// into the lower element, and zero the upper element.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Load 128-bits of integer data from memory.
/// `mem_addr` must be aligned on a 16-byte boundary.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted
{
    return *mem_addr;
}

alias _mm_load1_pd = _mm_load_pd1;

/// Load a double-precision (64-bit) floating-point element from memory
/// into the upper element, and copy the lower element from `a`.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    a.ptr[1] = *mem_addr;
    return a;
}

/// Load a 64-bit integer from memory into the lower element,
/// and zero the upper element.
// Note: strange signature since the memory doesn't have to be aligned
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted
{
    auto pLong = cast(const(long)*)mem_addr;
    long2 r = [0, 0];
    r.ptr[0] = *pLong;
    return cast(__m128i)(r);
}

/// Load a double-precision (64-bit) floating-point element from memory
/// into the lower element, and copy the upper element from `a`.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    a.ptr[0] = *mem_addr;
    return a;
}

/// Load 2 double-precision (64-bit) floating-point elements from memory
/// in reverse order. `mem_addr` must be aligned on a 16-byte boundary.
__m128d _mm_loadr_pd2 (const(double)* mem_addr) pure @trusted
{
    __m128d a = *cast(__m128d*)(mem_addr);
    __m128d r;
    r.ptr[0] = a.array[1];
    r.ptr[1] = a.array[0];
    return r;
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
/// elements) from memory. `mem_addr` does not need to be aligned.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loadupd(mem_addr);
    }
    else
    {
        return loadUnaligned!(double2)(mem_addr);
    }
}

/// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
    }
    else
    {
        return loadUnaligned!(__m128i)(cast(int*)mem_addr);
    }
}

/// Load a 32-bit integer from memory into the lower element,
/// and zero the upper elements.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
{
    int r = *cast(int*)(mem_addr);
    int4 result = [0, 0, 0, 0];
    result.ptr[0] = r;
    return result;
}
unittest
{
    int r = 42;
    __m128i A = _mm_loadu_si32(&r);
    int[4] correct = [42, 0, 0, 0];
    assert(A.array == correct);
}
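
// Added example (not in the original source): unaligned loads; stack variables
// make no 16-byte alignment promise, hence the *_loadu_* variants here.
unittest
{
    double[2] mem = [7.5, -1.0];
    __m128d A = _mm_loadu_pd(mem.ptr);
    assert(A.array == [7.5, -1.0]);

    long x = 0x0807_0605_0403_0201;
    long2 L = cast(long2) _mm_loadl_epi64(cast(const(__m128i)*)&x);
    assert(L.array[0] == x && L.array[1] == 0);
}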

static if (GDC_with_SSE2)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;
}
else static if (LDC_with_SSE2)
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    alias _mm_madd_epi16 = __builtin_ia32_pmaddwd128;
}
else
{
    /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
    /// and pack the results in destination.
    // TODO: #ARM
    __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @safe
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;

        int4 r;
        foreach(i; 0..4)
        {
            r.array[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return r;
    }
}
unittest
{
    short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
    int[4] correct = [1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}

static if (LDC_with_SSE2)
{
    /// Conditionally store 8-bit integer elements from `a` into memory using `mask`
    /// (elements are not stored when the highest bit is not set in the corresponding element)
    /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
    /// boundary.
    alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu; // can't do it with pure IR
}
else
{
    static if (GDC_with_SSE2)
    {
        ///ditto
        void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted
        {
            return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
        }
    }
    else
    {
        ///ditto
        // PERF: on ARM, this is absolutely catastrophic; however, needing this intrinsic is rare.
        void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) pure @trusted
        {
            byte16 b = cast(byte16)a;
            byte16 m = cast(byte16)mask;
            byte* dest = cast(byte*)(mem_addr);
            foreach(j; 0..16)
            {
                if (m.array[j] & 128)
                {
                    dest[j] = b.array[j];
                }
            }
        }
    }
}
unittest
{
    ubyte[16] dest =           [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
    __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] correct =        [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
    assert(dest == correct);
}

/// Compare packed 16-bit integers in `a` and `b`, and return packed maximum values.
__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lowerShorts);
    return _mm_xor_si128(b, mask);
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(45, 1, -4, -8, 9,  7, 0,-57),
                                          _mm_setr_epi16(-4,-8,  9,  7, 0,-57, 0,  0));
    short[8] correct =                                  [45, 1,  9,  7, 9,  7, 0,  0];
    assert(R.array == correct);
}


/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    __m128i value128 = _mm_set1_epi8(-128);
    __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, higher);
    return _mm_xor_si128(b, mask);
}
unittest
{
    byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9,  7, 0,-57, -4,-8,  9,  7, 0,-57, 0,  0),
                                         _mm_setr_epi8(-4,-8,  9,  7, 0,-57, 0,  0, 45, 1, -4, -8, 9,  7, 0,-57));
    byte[16] correct =                                [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and return packed maximum values.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxpd(a, b);
    }
    else
    {
        // Generates maxpd starting with LDC 1.9
        a[0] = (a[0] > b[0]) ? a[0] : b[0];
        a[1] = (a[1] > b[1]) ? a[1] : b[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 8.0);
}
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    __m128i value128 = _mm_set1_epi8(-128);
    __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, higher);
    return _mm_xor_si128(b, mask);
}
unittest
{
    byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
                                         _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
    byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`,
/// and return packed maximum values.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxpd(a, b);
    }
    else
    {
        // Generates maxpd starting with LDC 1.9
        a[0] = (a[0] > b[0]) ? a[0] : b[0];
        a[1] = (a[1] > b[1]) ? a[1] : b[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 8.0);
}

/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`,
/// store the maximum value in the lower element of the result, and copy the upper
/// element from `a`.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxsd(a, b);
    }
    else
    {
        __m128d r = a;
        // Generates maxsd starting with LDC 1.3
        r.array[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 1.0);
}

version(GNU)
{
    void _mm_mfence() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else static if (LDC_with_SSE2)
{
    alias _mm_mfence = __builtin_ia32_mfence;
}
else static if (DMD_with_asm)
{
    void _mm_mfence() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
}
else version(LDC)
{
    void _mm_mfence() pure @safe
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
}
else
    static assert(false);
unittest
{
    _mm_mfence();
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    // Note: clang uses a __builtin_ia32_pminsw128 which has disappeared from LDC LLVM (?)
    // Implemented using masks and XOR
    __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lowerShorts);
    return _mm_xor_si128(b, mask);
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-57),
                                          _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0));
    short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -57];
    assert(R.array == correct);
}
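// (Editor's note) Why the +128 bias in _mm_max_epu8/_mm_min_epu8 works: adding
// 0x80 maps unsigned byte order onto signed byte order, so a signed compare of
// the biased values ranks the original unsigned values. A hedged scalar
// demonstration of that invariant:
unittest
{
    static immutable int[6] vals = [0, 1, 127, 128, 200, 255];
    foreach (x; vals)
        foreach (y; vals)
        {
            bool unsignedLess = x < y;
            bool biasedSignedLess = cast(byte)(x + 0x80) < cast(byte)(y + 0x80);
            assert(unsignedLess == biasedSignedLess);
        }
}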
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    // Same remark as with _mm_min_epi16: clang uses mystery intrinsics we don't have
    __m128i value128 = _mm_set1_epi8(-128);
    __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
    __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
    __m128i mask = _mm_and_si128(aTob, lower);
    return _mm_xor_si128(b, mask);
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
                                         _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
    byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0];
    assert(R.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`,
/// and return packed minimum values.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minpd(a, b);
    }
    else
    {
        // Generates minpd starting with LDC 1.9
        a.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.array[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 1.0);
    __m128d M = _mm_min_pd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 1.0);
}

/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`,
/// store the minimum value in the lower element of the result, and copy the upper
/// element from `a`.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minsd(a, b);
    }
    else
    {
        // Generates minsd starting with LDC 1.3
        __m128d r = a;
        r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_min_sd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 3.0);
}

/// Copy the lower 64-bit integer in `a` to the lower element of the result,
/// and zero the upper element.
__m128i _mm_move_epi64 (__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movq128(a);
    }
    else
    {
        long2 result = [ 0, 0 ];
        long2 la = cast(long2) a;
        result.array[0] = la.array[0];
        return cast(__m128i)(result);
    }
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] correct = [13, 0];
    assert(B.array == correct);
}

/// Move the lower double-precision (64-bit) floating-point element from `b` to the
/// lower element of the result, and copy the upper element from `a`.
__m128d _mm_move_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movsd(a, b);
    }
    else
    {
        b.array[1] = a.array[1];
        return b;
    }
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(C.array == correct);
}

static if (GDC_with_SSE2)
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
}
else static if (LDC_with_SSE2)
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
}
// TODO #ARM: doesn't work
/*
else static if (LDC_with_ARM)
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    int _mm_movemask_epi8 (__m128i a) pure @safe
    {
        // PERF: looks worse than the one in simde
        byte16 ai = cast(byte16)a;
        byte16 shift7 = [7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7];
        ai = ai >>> shift7;
        byte16 shift = [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7];
        ai = ai << shift; // 4-way shift, only efficient on ARM.
        short8 lo = cast(short8) _mm_unpacklo_epi8(ai, _mm_setzero_si128());
        short8 hi = cast(short8) _mm_unpackhi_epi8(ai, _mm_setzero_si128());
        short8 shift8 = [8, 8, 8, 8, 8, 8, 8, 8];
        lo |= (hi << shift8);
        return lo.array[0] + lo.array[1] + lo.array[2] + lo.array[3]
             + lo.array[4] + lo.array[5] + lo.array[6] + lo.array[7];
    }
} */
else
{
    /// Create mask from the most significant bit of each 8-bit element in `v`.
    int _mm_movemask_epi8(__m128i v) pure @safe
    {
        byte16 ai = cast(byte16)v;
        int r = 0;
        foreach(bit; 0..16)
        {
            if (ai.array[bit] < 0) r += (1 << bit);
        }
        return r;
    }
}
unittest
{
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
}
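// (Editor's note) A classic use of _mm_movemask_epi8: turn a per-lane
// comparison into a bitmask and scan it. Hedged sketch of finding the first
// byte equal to a key; assumes _mm_cmpeq_epi8 (defined earlier in this
// module) and core.bitop.bsf from druntime.
unittest
{
    import core.bitop : bsf;
    __m128i haystack = _mm_setr_epi8(1, 2, 3, 4, 42, 6, 7, 8, 9, 10, 11, 12, 13, 42, 15, 16);
    __m128i eq = _mm_cmpeq_epi8(haystack, _mm_set1_epi8(42));
    int m = _mm_movemask_epi8(eq);
    assert(m != 0 && bsf(m) == 4); // first 42 is at byte index 4
}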
static if (GDC_with_SSE2)
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding
    /// packed double-precision (64-bit) floating-point element in `v`.
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}
else static if (LDC_with_SSE2)
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding
    /// packed double-precision (64-bit) floating-point element in `v`.
    alias _mm_movemask_pd = __builtin_ia32_movmskpd;
}
else
{
    /// Set each bit of mask `dst` based on the most significant bit of the corresponding
    /// packed double-precision (64-bit) floating-point element in `v`.
    int _mm_movemask_pd(__m128d v) pure @safe
    {
        long2 lv = cast(long2)v;
        int r = 0;
        if (lv.array[0] < 0) r += 1;
        if (lv.array[1] < 0) r += 2;
        return r;
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
    assert(_mm_movemask_pd(A) == 2);
}

/// Copy the lower 64-bit integer in `v`.
__m64 _mm_movepi64_pi64 (__m128i v) pure @safe
{
    long2 lv = cast(long2)v;
    return long1(lv.array[0]);
}
unittest
{
    __m128i A = _mm_set_epi64x(-1, -2);
    __m64 R = _mm_movepi64_pi64(A);
    assert(R.array[0] == -2);
}

/// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
__m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
{
    long2 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = 0;
    return cast(__m128i)r;
}
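// (Editor's note) _mm_movpi64_epi64 had no test; a minimal check in the
// style of the surrounding unit tests:
unittest
{
    __m64 A = _mm_cvtsi64_m64(-42);
    long2 R = cast(long2) _mm_movpi64_epi64(A);
    long[2] correct = [-42, 0];
    assert(R.array == correct);
}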
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element
/// in `a` and `b`, and return the unsigned 64-bit results.
// Note: generates pmuludq in LDC with -O1
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
{
    __m128i zero = _mm_setzero_si128();

    static if (__VERSION__ >= 2088)
    {
        // Need LLVM9 to avoid this shufflevector
        long2 la, lb;
        la.ptr[0] = cast(uint)a.array[0];
        la.ptr[1] = cast(uint)a.array[2];
        lb.ptr[0] = cast(uint)b.array[0];
        lb.ptr[1] = cast(uint)b.array[2];
    }
    else
    {
        long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
        long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
    }

    static if (__VERSION__ >= 2076)
    {
        return cast(__m128i)(la * lb);
    }
    else
    {
        // long2 mul not supported before LDC 1.5
        la.ptr[0] *= lb.array[0];
        la.ptr[1] *= lb.array[1];
        return cast(__m128i)(la);
    }
}
unittest
{
    __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
    __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL);
    assert(LC.array[1] == 12723420444339690338uL);
}

/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    return a * b;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_pd(a, a);
    assert(a.array == [4.0, 2.25]);
}

version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] * b.array[0];
        return a;
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_mul_sd = __builtin_ia32_mulsd;
    }
    else
    {
        __m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
        {
            a.array[0] *= b.array[0];
            return a;
        }
    }
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_sd(a, a);
    assert(a.array == [4.0, 1.5]);
}

/// Multiply the low unsigned 32-bit integers from `a` and `b`,
/// and get an unsigned 64-bit result.
__m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
    __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
    __m64 C = _mm_mul_su32(A, B);
    assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
}

static if (GDC_with_SSE2)
{
    /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// 32-bit integers, and return the high 16 bits of the intermediate integers.
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
}
else static if (LDC_with_SSE2)
{
    /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// 32-bit integers, and return the high 16 bits of the intermediate integers.
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
}
else
{
    /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate
    /// 32-bit integers, and return the high 16 bits of the intermediate integers.
    __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
        r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
        r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
        r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
        r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
        r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
        r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
        r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epi16(A, B);
    short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}

static if (GDC_with_SSE2)
{
    /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate
    /// 32-bit integers, and return the high 16 bits of the intermediate integers.
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
else static if (LDC_with_SSE2)
{
    /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate
    /// 32-bit integers, and return the high 16 bits of the intermediate integers.
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
}
else
{
    /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate
    /// 32-bit integers, and return the high 16 bits of the intermediate integers.
    __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
        r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
        r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
        r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
        r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
        r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
        r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
        r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(A, B);
    short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit
/// integers, and return the low 16 bits of the intermediate integers.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a * cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mullo_epi16(A, B);
    short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
}

/// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    return a | b;
}
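// (Editor's note) The OR intrinsics above had no test; a minimal check:
unittest
{
    __m128i A = _mm_setr_epi32(0x0F0F0F0F, 0, -1, 0x12345678);
    __m128i B = _mm_setr_epi32(0xF0F0F0F0, 0,  0, 0x00000001);
    int[4] correct = [0xFFFFFFFF, 0, -1, 0x12345679];
    assert((cast(int4) _mm_or_si128(A, B)).array == correct);
}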
static if (GDC_with_SSE2)
{
    /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
    /// using signed saturation.
    alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
}
else static if (LDC_with_SSE2)
{
    /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
    /// using signed saturation.
    alias _mm_packs_epi32 = __builtin_ia32_packssdw128;
}
else
{
    /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
    /// using signed saturation.
    __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
    {
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
        r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
        r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
        r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
        r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
        r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
        r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
        r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packs_epi32(A, A);
    short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
    assert(R.array == correct);
}

static if (GDC_with_SSE2)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
    /// using signed saturation.
    alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
}
else static if (LDC_with_SSE2)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
    /// using signed saturation.
    alias _mm_packs_epi16 = __builtin_ia32_packsswb128;
}
else
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
    /// using signed saturation.
    __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
    {
        byte16 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
        foreach(i; 0..8)
            r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte16 R = cast(byte16) _mm_packs_epi16(A, A);
    byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
                        127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

static if (GDC_with_SSE2)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
    /// using unsigned saturation.
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
else static if (LDC_with_SSE2)
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
    /// using unsigned saturation.
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
else
{
    /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
    /// using unsigned saturation.
    __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        ubyte[16] result = void;
        for (int i = 0; i < 8; ++i)
        {
            short s = sa[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i] = cast(ubyte)s;

            s = sb[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i+8] = cast(ubyte)s;
        }
        return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    foreach(i; 0..16)
        assert(AA.array[i] == cast(byte)(correctResult[i]));
}
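// (Editor's note) The pack intrinsics chain naturally: _mm_packs_epi32 then
// _mm_packs_epi16 narrows 32-bit integers down to bytes, saturating at each
// step. A hedged demonstration:
unittest
{
    __m128i v = _mm_setr_epi32(-70000, 300, -2, 1000000);
    __m128i w16 = _mm_packs_epi32(v, v);                 // shorts: -32768, 300, -2, 32767, ...
    byte16 w8 = cast(byte16) _mm_packs_epi16(w16, w16);  // bytes:  -128, 127, -2, 127, ...
    assert(w8.array[0] == -128);
    assert(w8.array[1] == 127);
    assert(w8.array[2] == -2);
    assert(w8.array[3] == 127);
}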
version(GNU)
{
    void _mm_pause() pure @trusted
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_pause();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "pause;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
}
else static if (LDC_with_SSE2)
{
    alias _mm_pause = __builtin_ia32_pause;
}
else static if (DMD_with_asm)
{
    void _mm_pause() pure @safe
    {
        asm nothrow @nogc pure @safe
        {
            rep; nop; // F3 90 = pause
        }
    }
}
else version (LDC)
{
    void _mm_pause() pure @safe
    {
        // PERF: does nothing currently; could be the "yield" instruction on ARM.
    }
}
else
    static assert(false);
unittest
{
    _mm_pause();
}

static if (GDC_with_SSE2)
{
    /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`,
    /// then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit
    /// integers, stored in the low 16 bits of the two 64-bit elements of the result.
    alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
}
else static if (LDC_with_SSE2)
{
    /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`,
    /// then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit
    /// integers, stored in the low 16 bits of the two 64-bit elements of the result.
    alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
}
else
{
    /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`,
    /// then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit
    /// integers, stored in the low 16 bits of the two 64-bit elements of the result.
    __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
    {
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        ubyte[16] t;
        foreach(i; 0..16)
        {
            int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
            if (diff < 0) diff = -diff;
            t[i] = cast(ubyte)(diff);
        }
        int4 r = _mm_setzero_si128();
        r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
        r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m128i B = _mm_set1_epi8(1);
    __m128i R = _mm_sad_epu8(A, B);
    int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values.
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
    return cast(__m128i) loadUnaligned!(short8)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 B = cast(short8) A;
    foreach(i; 0..8)
        assert(B.array[i] == i);
}

/// Set packed 32-bit integers with the supplied values.
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(A.array[i] == i);
}

/// Set packed 64-bit integers with the supplied values.
__m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
{
    long[2] result = [e0.array[0], e1.array[0]];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

/// Set packed 64-bit integers with the supplied values.
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    long[2] result = [e0, e1];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64x(1234, 5678);
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

/// Set packed 8-bit integers with the supplied values.
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6,  e7,
                       e8, e9, e10, e11, e12, e13, e14, e15];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied values.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e0, e1];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_pd(61.0, 55.0);
    double[2] correct = [55.0, 61.0];
    assert(A.array == correct);
}
/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    double[2] result = [a, a];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_pd1(61.0);
    double[2] correct = [61.0, 61.0];
    assert(A.array == correct);
}

/// Copy double-precision (64-bit) floating-point element `a` to the lower element
/// of the result, and zero the upper element.
__m128d _mm_set_sd (double a) pure @trusted
{
    double[2] result = [a, 0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Broadcast 16-bit integer `a` to all elements.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    return cast(__m128i)(short8(a));
}

/// Broadcast 32-bit integer `a` to all elements.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    return cast(__m128i)(int4(a));
}
unittest
{
    __m128 a = _mm_set1_ps(-1.0f);
    __m128 b = cast(__m128) _mm_set1_epi32(0x7fffffff);
    assert(_mm_and_ps(a, b).array == [1.0f, 1, 1, 1]);
}

/// Broadcast 64-bit integer `a` to all elements of `dst`.
__m128i _mm_set1_epi64 (__m64 a) pure @safe
{
    return _mm_set_epi64(a, a);
}

/// Broadcast 64-bit integer `a` to all elements.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    return cast(__m128i)(long2(a));
}

/// Broadcast 8-bit integer `a` to all elements.
__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    return cast(__m128i)(byte16(a));
}

alias _mm_set1_pd = _mm_set_pd1;

/// Set packed 16-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4,
                        short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] result = [e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}

/// Set packed 64-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] result = [e1, e0];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9, byte e8,
                       byte e7, byte e6, byte e5, byte e4,
                       byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
                        e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied
/// values in reverse order.
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    double[2] result = [e1, e0];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_setr_pd(61.0, 55.0);
    double[2] correct = [61.0, 55.0];
    assert(A.array == correct);
}

/// Return vector of type `__m128d` with all elements set to zero.
__m128d _mm_setzero_pd () pure @trusted
{
    // Note: using loadUnaligned has better -O0 codegen compared to .ptr
    double[2] result = [0.0, 0.0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Return vector of type `__m128i` with all elements set to zero.
__m128i _mm_setzero_si128() pure @trusted
{
    // Note: using loadUnaligned has better -O0 codegen compared to .ptr
    int[4] result = [0, 0, 0, 0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}
/// Shuffle 32-bit integers in `a` using the control in `imm8`.
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufd(a, imm8);
    }
    else
    {
        return shufflevector!(int4, (imm8 >> 0) & 3,
                                    (imm8 >> 2) & 3,
                                    (imm8 >> 4) & 3,
                                    (imm8 >> 6) & 3)(a, a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

/// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_shufpd(a, b, imm8);
    }
    else
    {
        return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                       2 + ( (imm8 >> 1) & 1 ))(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}

/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`;
/// the low 64 bits are passed through.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufhw(a, imm8);
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                            4 + ( (imm8 >> 0) & 3 ),
                                            4 + ( (imm8 >> 2) & 3 ),
                                            4 + ( (imm8 >> 4) & 3 ),
                                            4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}

/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`;
/// the high 64 bits are passed through.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshuflw(a, imm8);
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                    ( (imm8 >> 2) & 3 ),
                                                    ( (imm8 >> 4) & 3 ),
                                                    ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi32 instead.") alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi32 instead.") alias _mm_sll_epi32 = __builtin_ia32_pslld128;
}
else static if (DMD_with_32bit_asm)
{
    deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            pslld XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << bits;
        return r;
    }
}

static if (LDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi64 instead.") alias _mm_sll_epi64 = __builtin_ia32_psllq128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi64 instead.") alias _mm_sll_epi64 = __builtin_ia32_psllq128;
}
else static if (DMD_with_32bit_asm)
{
    deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllq XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @safe
    {
        // ARM: good since LDC 1.12 -O2,
        // but the -O0 version is catastrophic
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..2)
            r.array[i] = cast(ulong)(sa.array[i]) << bits;
        return cast(__m128i)r;
    }
}

static if (LDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi16 instead.") alias _mm_sll_epi16 = __builtin_ia32_psllw128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_slli_epi16 instead.") alias _mm_sll_epi16 = __builtin_ia32_psllw128;
}
else static if (DMD_with_32bit_asm)
{
    deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
    {
        asm pure nothrow @nogc
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
}
else
{
    deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
        return cast(int4)r;
    }
}

static if (LDC_with_SSE2)
{
    alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
    }
    else
    {
        // TODO #ARM, not fantastic again
        __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @safe
        {
            int4 r = void;
            foreach(i; 0..4)
                r.array[i] = cast(uint)(a.array[i]) << imm8;
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
}

static if (LDC_with_SSE2)
{
    alias _mm_slli_epi64 = __builtin_ia32_psllqi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_slli_epi64 = __builtin_ia32_psllqi128;
    }
    else
    {
        // PERF #ARM: unroll that loop
        __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @safe
        {
            long2 r = void;
            long2 sa = cast(long2)a;
            foreach(i; 0..2)
                r.array[i] = cast(ulong)(sa.array[i]) << imm8;
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_slli_epi64(A, 1);
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
}

static if (LDC_with_SSE2)
{
    alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
    }
    else
    {
        // TODO #ARM
        __m128i _mm_slli_epi16 (__m128i a, int imm8) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(cast(ushort)(sa.array[i]) << imm8);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
}
/// Shift `a` left by `bytes` bytes while shifting in zeros.
__m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
{
    static if (bytes & 0xF0)
    {
        return _mm_setzero_si128();
    }
    else
    {
        static if (GDC_with_SSE2)
        {
            return __builtin_ia32_pslldqi128(op, cast(ubyte)(bytes * 8));
        }
        else version(DigitalMars)
        {
            version(D_InlineAsm_X86)
            {
                asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
                {
                    movdqu XMM0, op;
                    pslldq XMM0, bytes;
                    movdqu op, XMM0;
                }
                return op;
            }
            else
            {
                byte16 A = cast(byte16)op;
                byte16 R;
                for (int n = 15; n >= bytes; --n)
                    R.ptr[n] = A.array[n-bytes];
                for (int n = bytes-1; n >= 0; --n)
                    R.ptr[n] = 0;
                return cast(__m128i)R;
            }
        }
        else
        {
            return cast(__m128i) shufflevector!(byte16,
                16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
                22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
                28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
                (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
    short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(R.array == correct);
}

version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    else
    {
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sqrt_pd = __builtin_ia32_sqrtpd;
    }
    else
    {
        __m128d _mm_sqrt_pd(__m128d vec) pure @safe
        {
            vec.array[0] = sqrt(vec.array[0]);
            vec.array[1] = sqrt(vec.array[1]);
            return vec;
        }
    }
}

version(LDC)
{
    // Disappeared with LDC 1.11
    static if (__VERSION__ < 2081)
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    else
    {
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            return vec;
        }
    }
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_sqrt_sd = __builtin_ia32_sqrtsd;
    }
    else
    {
        __m128d _mm_sqrt_sd(__m128d vec) pure @safe
        {
            vec.array[0] = sqrt(vec.array[0]);
            vec.array[1] = vec.array[1];
            return vec;
        }
    }
}

static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srai_epi16 instead.") alias _mm_sra_epi16 = __builtin_ia32_psraw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        deprecated("Use _mm_srai_epi16 instead.") alias _mm_sra_epi16 = __builtin_ia32_psraw128;
    }
    else
    {
        deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @safe
        {
            short8 sa = cast(short8)a;
            long2 lc = cast(long2)count;
            int bits = cast(int)(lc.array[0]);
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(sa.array[i] >> bits);
            return cast(int4)r;
        }
    }
}
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srai_epi32 instead.") alias _mm_sra_epi32 = __builtin_ia32_psrad128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_srai_epi32 instead.") alias _mm_sra_epi32 = __builtin_ia32_psrad128;
}
else
{
    deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        r.array[0] = (a.array[0] >> bits);
        r.array[1] = (a.array[1] >> bits);
        r.array[2] = (a.array[2] >> bits);
        r.array[3] = (a.array[3] >> bits);
        return r;
    }
}

static if (LDC_with_SSE2)
{
    alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
    }
    else
    {
        // TODO #ARM
        __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(sa.array[i] >> imm8);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
}

static if (LDC_with_SSE2)
{
    alias _mm_srai_epi32 = __builtin_ia32_psradi128;
}
else static if (GDC_with_SSE2)
{
    alias _mm_srai_epi32 = __builtin_ia32_psradi128;
}
else
{
    __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @safe
    {
        int4 r = void;
        r.array[0] = (a.array[0] >> imm8);
        r.array[1] = (a.array[1] >> imm8);
        r.array[2] = (a.array[2] >> imm8);
        r.array[3] = (a.array[3] >> imm8);
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srai_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
}

static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi16 instead.") alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi16 instead.") alias _mm_srl_epi16 = __builtin_ia32_psrlw128;
}
else
{
    deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @safe
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
        return cast(int4)r;
    }
}

static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi32 instead.") alias _mm_srl_epi32 = __builtin_ia32_psrld128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi32 instead.") alias _mm_srl_epi32 = __builtin_ia32_psrld128;
}
else
{
    deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @safe
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        r.array[0] = cast(uint)(a.array[0]) >> bits;
        r.array[1] = cast(uint)(a.array[1]) >> bits;
        r.array[2] = cast(uint)(a.array[2]) >> bits;
        r.array[3] = cast(uint)(a.array[3]) >> bits;
        return r;
    }
}
static if (LDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi64 instead.") alias _mm_srl_epi64 = __builtin_ia32_psrlq128;
}
else static if (GDC_with_SSE2)
{
    deprecated("Use _mm_srli_epi64 instead.") alias _mm_srl_epi64 = __builtin_ia32_psrlq128;
}
else
{
    deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @safe
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        r.array[0] = cast(ulong)(sa.array[0]) >> bits;
        r.array[1] = cast(ulong)(sa.array[1]) >> bits;
        return cast(__m128i)r;
    }
}

static if (LDC_with_SSE2)
{
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
    }
    else
    {
        // TODO #ARM
        __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @safe
        {
            short8 sa = cast(short8)a;
            short8 r = void;
            foreach(i; 0..8)
                r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> imm8);
            return cast(int4)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
}

static if (LDC_with_SSE2)
{
    alias _mm_srli_epi32 = __builtin_ia32_psrldi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi32 = __builtin_ia32_psrldi128;
    }
    else
    {
        __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
        {
            int4 r = void;
            r.ptr[0] = cast(uint)(a.array[0]) >> imm8;
            r.ptr[1] = cast(uint)(a.array[1]) >> imm8;
            r.ptr[2] = cast(uint)(a.array[2]) >> imm8;
            r.ptr[3] = cast(uint)(a.array[3]) >> imm8;
            return r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srli_epi32(A, 1);
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
}

static if (LDC_with_SSE2)
{
    alias _mm_srli_epi64 = __builtin_ia32_psrlqi128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_srli_epi64 = __builtin_ia32_psrlqi128;
    }
    else
    {
        __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
        {
            long2 r = void;
            long2 sa = cast(long2)a;
            r.ptr[0] = cast(ulong)(sa.array[0]) >> imm8;
            r.ptr[1] = cast(ulong)(sa.array[1]) >> imm8;
            return cast(__m128i)r;
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srli_epi64(A, 1);
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
}
/// Shift `v` right by `bytes` bytes while shifting in zeros.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    static if (bytes & 0xF0)
    {
        return _mm_setzero_si128();
    }
    else
    {
        static if (GDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psrldqi128(v, cast(ubyte)(bytes * 8));
        }
        else static if (DMD_with_32bit_asm)
        {
            asm pure nothrow @nogc @trusted
            {
                movdqu XMM0, v;
                psrldq XMM0, bytes;
                movdqu v, XMM0;
            }
            return v;
        }
        else
        {
            return cast(__m128i) shufflevector!(byte16,
                bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
                bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
                (cast(byte16) v, cast(byte16)_mm_setzero_si128());
        }
    }
}
unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] correct = [2, 3, 4, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
}
unittest
{
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary.
void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into
/// 2 contiguous elements in memory. `mem_addr` must be aligned on a 16-byte boundary.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    __m128d r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[0];
    *aligned = r;
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[0];
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned
/// on a 16-byte boundary.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1;

/// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[1];
}

// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
/// Store 64-bit integer from the first element of `a` into memory.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long* dest = cast(long*)mem_addr;
    long2 la = cast(long2)a;
    *dest = la.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[0];
}

/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in
/// reverse order. `mem_addr` must be aligned on a 16-byte boundary.
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!double2(a, mem_addr);
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be
/// aligned on any particular boundary.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
void _mm_stream_pd (double* mem_addr, __m128d a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a;
}

/// Store 128-bits of integer data from `a` into memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    __m128i* dest = cast(__m128i*)mem_addr;
    *dest = a;
}

/// Store 32-bit integer `a` into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address `mem_addr` is already in the cache,
/// the cache will be updated.
void _mm_stream_si32 (int* mem_addr, int a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Store 64-bit integer `a` into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address `mem_addr` is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    return a - b;
}

version(DigitalMars)
{
    // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
}
else static if (GDC_with_SSE2)
{
    alias _mm_sub_sd = __builtin_ia32_subsd;
}
else
{
    __m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
    {
        a.array[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}

/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    return a - b;
}
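// (Editor's note) The _mm_sub_epi* wrappers above had no tests; a minimal
// check of the 32-bit variant:
unittest
{
    __m128i A = _mm_setr_epi32(4, 8, -13, 7);
    __m128i B = _mm_setr_epi32(1, -8, 3, 7);
    int[4] correct = [3, 16, -16, 0];
    assert((cast(int4) _mm_sub_epi32(A, B)).array == correct);
}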
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // Generates PSUBSW since LDC 1.15 -O0
        /// Subtract packed signed 16-bit integers in `b` from packed signed 16-bit integers
        /// in `a` using signed saturation.
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
    {
        /// Subtract packed signed 16-bit integers in `b` from packed signed 16-bit integers
        /// in `a` using signed saturation.
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
    else
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
    }
    else
    {
        /// Subtract packed signed 16-bit integers in `b` from packed signed 16-bit integers
        /// in `a` using signed saturation.
        __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // x86: Generates PSUBSB since LDC 1.15 -O0
        // ARM: Generates sqsub.16b since LDC 1.21 -O0
        /// Subtract packed signed 8-bit integers in `b` from packed signed 8-bit integers
        /// in `a` using signed saturation.
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
    {
        /// Subtract packed signed 8-bit integers in `b` from packed signed 8-bit integers
        /// in `a` using signed saturation.
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
    else
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
    }
    else
    {
        /// Subtract packed signed 8-bit integers in `b` from packed signed 8-bit integers
        /// in `a` using signed saturation.
        __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
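// (Editor's note) A classic use of unsigned saturating subtraction: per-byte
// absolute difference as subs_epu8(a,b) | subs_epu8(b,a), since one side
// always saturates to zero. Hedged demonstration, using _mm_subs_epu8 and
// _mm_or_si128 from this module:
unittest
{
    __m128i a = _mm_set1_epi8(17);
    __m128i b = _mm_set1_epi8(25);
    __m128i absdiff = _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
    assert((cast(byte16)absdiff).array[0] == 8);
}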
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // x86: Generates PSUBUSW since LDC 1.15 -O0
        // ARM: Generates uqsub.8h since LDC 1.21 -O0
        /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
        /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
    }
    else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
    {
        /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
        /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(sum);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
    else
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
    }
    else
    {
        /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
        /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(sum);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534, 1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}
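// (Editor's note) _mm_subs_epu16 also gives a branchless "decrement, but not
// below zero", handy for per-element countdowns. A hedged example:
unittest
{
    __m128i timers = _mm_setr_epi16(0, 1, 2, 30000, 0, 5, 6, 7);
    timers = _mm_subs_epu16(timers, _mm_set1_epi16(1));
    short[8] correct = [0, 0, 1, 29999, 0, 4, 5, 6];
    assert((cast(short8)timers).array == correct);
}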
version(LDC)
{
    static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
    {
        // x86: Generates PSUBUSB since LDC 1.15 -O0
        // ARM: Generates uqsub.16b since LDC 1.21 -O0
        /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
        /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
    }
    else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
    {
        /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
        /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            // PERF #ARM32 Use an intrinsic in gccbuiltins_arm.d instead
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
    else
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
}
else
{
    static if (GDC_with_SSE2)
    {
        alias _mm_subs_epu8 = __builtin_ia32_psubusb128;
    }
    else
    {
        /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
        /// integers in `a` using unsigned saturation.
        __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Note: the only difference between the ucomi and comi intrinsics is their
// signalling behaviour on quiet NaNs. Aliasing one to the other is therefore
// slightly incorrect, but wanting to distinguish qNaN from sNaN and treat
// them differently on purpose seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd;
alias _mm_ucomige_sd = _mm_comige_sd;
alias _mm_ucomigt_sd = _mm_comigt_sd;
alias _mm_ucomile_sd = _mm_comile_sd;
alias _mm_ucomilt_sd = _mm_comilt_sd;
alias _mm_ucomineq_sd = _mm_comineq_sd;

/// Return vector of type `__m128d` with undefined elements.
__m128d _mm_undefined_pd() pure @safe
{
    __m128d result = void;
    return result;
}

/// Return vector of type `__m128i` with undefined elements.
__m128i _mm_undefined_si128() pure @safe
{
    __m128i result = void;
    return result;
}

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhwd128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}
/// Unpack and interleave double-precision (64-bit) floating-point elements
/// from the high half of `a` and `b`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpckhpd(a, b);
    }
    else
    {
        return shufflevector!(__m128d, 1, 3)(a, b);
    }
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpcklwd128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                           (cast(short8)a, cast(short8)b);
    }
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckldq128(a, b);
    }
    else
    {
        return shufflevector!(int4, 0, 4, 1, 5)
                             (cast(int4)a, cast(int4)b);
    }
}

/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpcklqdq128(a, b);
    }
    else
    {
        long2 lA = cast(long2)a;
        long2 lB = cast(long2)b;
        long2 R;
        R.ptr[0] = lA.array[0];
        R.ptr[1] = lB.array[0];
        return cast(__m128i)R;
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
    long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
    assert(C.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpcklbw128(a, b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                            4, 20, 5, 21, 6, 22, 7, 23)
                                           (cast(byte16)a, cast(byte16)b);
    }
}

/// Unpack and interleave double-precision (64-bit) floating-point elements
/// from the low half of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpcklpd(a, b);
    }
    else
    {
        return shufflevector!(__m128d, 0, 2)(a, b);
    }
}
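// The lower double of `a` followed by the lower double of `b` (UNPCKLPD semantics).
unittest
{
    __m128d A = [1.5, 2.5];
    __m128d B = [3.5, 4.5];
    __m128d C = _mm_unpacklo_pd(A, B);
    assert(C.array == [1.5, 3.5]);
}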
/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
}

/// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}

unittest
{
    // Euclidean distance between two points in 4D
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        // horizontal sum of the four squared differences, accumulated in the low lane
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}
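// Sanity checks for the XOR intrinsics above: x ^ x gives all zeroes, and
// XOR against the sign-bit mask flips the sign of each double.
unittest
{
    __m128i A = _mm_setr_epi32(11, 22, 33, 44);
    int4 R = _mm_xor_si128(A, A);
    int[4] correct = [0, 0, 0, 0];
    assert(R.array == correct);
}
unittest
{
    __m128d A = [4.0, -9.5];
    __m128d signBits = [-0.0, -0.0]; // only the sign bit set in each lane
    __m128d R = _mm_xor_pd(A, signBits);
    assert(R.array == [-4.0, 9.5]);
}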