1 /** 2 * SSE2 intrinsics. 3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2 4 * 5 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019. 6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 7 */ 8 module inteli.emmintrin; 9 10 public import inteli.types; 11 public import inteli.xmmintrin; // SSE2 includes SSE1 12 import inteli.mmx; 13 import inteli.internals; 14 15 nothrow @nogc: 16 17 18 // SSE2 instructions 19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2 20 21 /// Add packed 16-bit integers in `a` and `b`. 22 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe 23 { 24 pragma(inline, true); 25 return cast(__m128i)(cast(short8)a + cast(short8)b); 26 } 27 unittest 28 { 29 __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77); 30 short8 R = cast(short8) _mm_add_epi16(A, A); 31 short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154]; 32 assert(R.array == correct); 33 } 34 35 /// Add packed 32-bit integers in `a` and `b`. 36 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe 37 { 38 pragma(inline, true); 39 return cast(__m128i)(cast(int4)a + cast(int4)b); 40 } 41 unittest 42 { 43 __m128i A = _mm_setr_epi32( -7, -1, 0, 9); 44 int4 R = _mm_add_epi32(A, A); 45 int[4] correct = [ -14, -2, 0, 18 ]; 46 assert(R.array == correct); 47 } 48 49 /// Add packed 64-bit integers in `a` and `b`. 50 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe 51 { 52 pragma(inline, true); 53 return cast(__m128i)(cast(long2)a + cast(long2)b); 54 } 55 unittest 56 { 57 __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000); 58 long2 R = cast(long2) _mm_add_epi64(A, A); 59 long[2] correct = [ -2, 0 ]; 60 assert(R.array == correct); 61 } 62 63 /// Add packed 8-bit integers in `a` and `b`. 64 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe 65 { 66 pragma(inline, true); 67 return cast(__m128i)(cast(byte16)a + cast(byte16)b); 68 } 69 unittest 70 { 71 __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78); 72 byte16 R = cast(byte16) _mm_add_epi8(A, A); 73 byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100]; 74 assert(R.array == correct); 75 } 76 77 /// Add the lower double-precision (64-bit) floating-point element 78 /// in `a` and `b`, store the result in the lower element of dst, 79 /// and copy the upper element from `a` to the upper element of destination. 80 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe 81 { 82 static if (GDC_with_SSE2) 83 { 84 return __builtin_ia32_addsd(a, b); 85 } 86 else version(DigitalMars) 87 { 88 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 89 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 90 asm pure nothrow @nogc @trusted { nop;} 91 a[0] = a[0] + b[0]; 92 return a; 93 } 94 else 95 { 96 a[0] += b[0]; 97 return a; 98 } 99 } 100 unittest 101 { 102 __m128d a = [1.5, -2.0]; 103 a = _mm_add_sd(a, a); 104 assert(a.array == [3.0, -2.0]); 105 } 106 107 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`. 108 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe 109 { 110 pragma(inline, true); 111 return a + b; 112 } 113 unittest 114 { 115 __m128d a = [1.5, -2.0]; 116 a = _mm_add_pd(a, a); 117 assert(a.array == [3.0, -4.0]); 118 } 119 120 /// Add 64-bit integers `a` and `b`. 
121 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe 122 { 123 pragma(inline, true); 124 return a + b; 125 } 126 127 /// Add packed 16-bit integers in `a` and `b` using signed saturation. 128 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted 129 { 130 static if (GDC_with_SSE2) 131 { 132 return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b); 133 } 134 else version(LDC) 135 { 136 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 137 { 138 // x86: Generates PADDSW since LDC 1.15 -O0 139 // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20 140 enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 141 enum ir = ` 142 %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 143 ret <8 x i16> %r`; 144 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 145 } 146 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 147 { 148 short[8] res; // PERF =void; 149 short8 sa = cast(short8)a; 150 short8 sb = cast(short8)b; 151 foreach(i; 0..8) 152 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 153 return _mm_loadu_si128(cast(int4*)res.ptr); 154 } 155 else 156 return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b); 157 } 158 else 159 { 160 short[8] res; // PERF =void; 161 short8 sa = cast(short8)a; 162 short8 sb = cast(short8)b; 163 foreach(i; 0..8) 164 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 165 return _mm_loadu_si128(cast(int4*)res.ptr); 166 } 167 } 168 unittest 169 { 170 short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), 171 _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0)); 172 static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14]; 173 assert(res.array == correctResult); 174 } 175 176 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation. 
177 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted 178 { 179 static if (GDC_with_SSE2) 180 { 181 return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b); 182 } 183 else version(LDC) 184 { 185 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 186 { 187 // x86: Generates PADDSB since LDC 1.15 -O0 188 // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20 189 enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 190 enum ir = ` 191 %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 192 ret <16 x i8> %r`; 193 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 194 } 195 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 196 { 197 byte[16] res; // PERF =void; 198 byte16 sa = cast(byte16)a; 199 byte16 sb = cast(byte16)b; 200 foreach(i; 0..16) 201 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]); 202 return _mm_loadu_si128(cast(int4*)res.ptr); 203 } 204 else 205 return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b); 206 } 207 else 208 { 209 byte[16] res; // PERF =void; 210 byte16 sa = cast(byte16)a; 211 byte16 sb = cast(byte16)b; 212 foreach(i; 0..16) 213 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]); 214 return _mm_loadu_si128(cast(int4*)res.ptr); 215 } 216 } 217 unittest 218 { 219 byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 220 _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 221 static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14, 222 16, 18, 20, 22, 24, 26, 28, 30]; 223 assert(res.array == correctResult); 224 } 225 226 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 227 // PERF: #GDC version? 
228 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted 229 { 230 version(LDC) 231 { 232 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 233 { 234 // x86: Generates PADDUSB since LDC 1.15 -O0 235 // ARM: Generates uqadd.16b since LDC 1.21 -O1 236 enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 237 enum ir = ` 238 %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 239 ret <16 x i8> %r`; 240 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 241 } 242 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 243 { 244 ubyte[16] res; // PERF =void; 245 byte16 sa = cast(byte16)a; 246 byte16 sb = cast(byte16)b; 247 foreach(i; 0..16) 248 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 249 return _mm_loadu_si128(cast(int4*)res.ptr); 250 } 251 else 252 return __builtin_ia32_paddusb128(a, b); 253 } 254 else 255 { 256 ubyte[16] res; // PERF =void; 257 byte16 sa = cast(byte16)a; 258 byte16 sb = cast(byte16)b; 259 foreach(i; 0..16) 260 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 261 return _mm_loadu_si128(cast(int4*)res.ptr); 262 } 263 } 264 unittest 265 { 266 byte16 res = cast(byte16) 267 _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0), 268 _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0)); 269 static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 270 0, cast(byte)255, 4, 6, 8, 10, 12, 14]; 271 assert(res.array == correctResult); 272 } 273 274 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation. 275 // PERF: #GDC version? 
276 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted 277 { 278 version(LDC) 279 { 280 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 281 { 282 // x86: Generates PADDUSW since LDC 1.15 -O0 283 // ARM: Generates uqadd.8h since LDC 1.21 -O1 284 enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 285 enum ir = ` 286 %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 287 ret <8 x i16> %r`; 288 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 289 } 290 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 291 { 292 ushort[8] res; // PERF =void; 293 short8 sa = cast(short8)a; 294 short8 sb = cast(short8)b; 295 foreach(i; 0..8) 296 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 297 return _mm_loadu_si128(cast(int4*)res.ptr); 298 } 299 else 300 return __builtin_ia32_paddusw128(a, b); 301 } 302 else 303 { 304 ushort[8] res; // PERF =void; 305 short8 sa = cast(short8)a; 306 short8 sb = cast(short8)b; 307 foreach(i; 0..8) 308 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 309 return _mm_loadu_si128(cast(int4*)res.ptr); 310 } 311 } 312 unittest 313 { 314 short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0), 315 _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0)); 316 static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6]; 317 assert(res.array == correctResult); 318 } 319 320 /// Compute the bitwise AND of packed double-precision (64-bit) 321 /// floating-point elements in `a` and `b`. 322 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe 323 { 324 pragma(inline, true); 325 return cast(__m128d)( cast(long2)a & cast(long2)b ); 326 } 327 unittest 328 { 329 double a = 4.32; 330 double b = -78.99; 331 long correct = (*cast(long*)(&a)) & (*cast(long*)(&b)); 332 __m128d A = _mm_set_pd(a, b); 333 __m128d B = _mm_set_pd(b, a); 334 long2 R = cast(long2)( _mm_and_pd(A, B) ); 335 assert(R.array[0] == correct); 336 assert(R.array[1] == correct); 337 } 338 339 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`. 340 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe 341 { 342 pragma(inline, true); 343 return a & b; 344 } 345 unittest 346 { 347 __m128i A = _mm_set1_epi32(7); 348 __m128i B = _mm_set1_epi32(14); 349 __m128i R = _mm_and_si128(A, B); 350 int[4] correct = [6, 6, 6, 6]; 351 assert(R.array == correct); 352 } 353 354 /// Compute the bitwise NOT of packed double-precision (64-bit) 355 /// floating-point elements in `a` and then AND with `b`. 356 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe 357 { 358 return cast(__m128d)( ~(cast(long2)a) & cast(long2)b); 359 } 360 unittest 361 { 362 double a = 4.32; 363 double b = -78.99; 364 long correct = (~*cast(long*)(&a)) & ( *cast(long*)(&b)); 365 long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b)); 366 __m128d A = _mm_setr_pd(a, b); 367 __m128d B = _mm_setr_pd(b, a); 368 long2 R = cast(long2)( _mm_andnot_pd(A, B) ); 369 assert(R.array[0] == correct); 370 assert(R.array[1] == correct2); 371 } 372 373 /// Compute the bitwise NOT of 128 bits (representing integer data) 374 /// in `a` and then AND with `b`. 
375 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe 376 { 377 return (~a) & b; 378 } 379 unittest 380 { 381 __m128i A = _mm_set1_epi32(7); 382 __m128i B = _mm_set1_epi32(14); 383 __m128i R = _mm_andnot_si128(A, B); 384 int[4] correct = [8, 8, 8, 8]; 385 assert(R.array == correct); 386 } 387 388 /// Average packed unsigned 16-bit integers in `a` and `b`. 389 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted 390 { 391 static if (GDC_with_SSE2) 392 { 393 return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b); 394 } 395 else static if (LDC_with_ARM64) 396 { 397 return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b); 398 } 399 else version(LDC) 400 { 401 // Generates pavgw even in LDC 1.0, even in -O0 402 // But not in ARM 403 enum ir = ` 404 %ia = zext <8 x i16> %0 to <8 x i32> 405 %ib = zext <8 x i16> %1 to <8 x i32> 406 %isum = add <8 x i32> %ia, %ib 407 %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 408 %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 409 %r = trunc <8 x i32> %isums to <8 x i16> 410 ret <8 x i16> %r`; 411 return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b); 412 } 413 else 414 { 415 short8 sa = cast(short8)a; 416 short8 sb = cast(short8)b; 417 short8 sr = void; 418 foreach(i; 0..8) 419 { 420 sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 ); 421 } 422 return cast(int4)sr; 423 } 424 } 425 unittest 426 { 427 __m128i A = _mm_set1_epi16(31); 428 __m128i B = _mm_set1_epi16(64); 429 short8 avg = cast(short8)(_mm_avg_epu16(A, B)); 430 foreach(i; 0..8) 431 assert(avg.array[i] == 48); 432 } 433 434 /// Average packed unsigned 8-bit integers in `a` and `b`. 435 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted 436 { 437 static if (GDC_with_SSE2) 438 { 439 return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b); 440 } 441 else static if (LDC_with_ARM64) 442 { 443 return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b); 444 } 445 else version(LDC) 446 { 447 // Generates pavgb even in LDC 1.0, even in -O0 448 // But not in ARM 449 enum ir = ` 450 %ia = zext <16 x i8> %0 to <16 x i16> 451 %ib = zext <16 x i8> %1 to <16 x i16> 452 %isum = add <16 x i16> %ia, %ib 453 %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 454 %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 455 %r = trunc <16 x i16> %isums to <16 x i8> 456 ret <16 x i8> %r`; 457 return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 458 } 459 else 460 { 461 byte16 sa = cast(byte16)a; 462 byte16 sb = cast(byte16)b; 463 byte16 sr = void; 464 foreach(i; 0..16) 465 { 466 sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 ); 467 } 468 return cast(int4)sr; 469 } 470 } 471 unittest 472 { 473 __m128i A = _mm_set1_epi8(31); 474 __m128i B = _mm_set1_epi8(64); 475 byte16 avg = cast(byte16)(_mm_avg_epu8(A, B)); 476 foreach(i; 0..16) 477 assert(avg.array[i] == 48); 478 } 479 480 /// Shift `a` left by `bytes` bytes while shifting in zeros. 
481 alias _mm_bslli_si128 = _mm_slli_si128; 482 unittest 483 { 484 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 485 byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; 486 __m128i result = _mm_bslli_si128!5(toShift); 487 assert( (cast(byte16)result).array == exact); 488 } 489 490 /// Shift `v` right by `bytes` bytes while shifting in zeros. 491 alias _mm_bsrli_si128 = _mm_srli_si128; 492 unittest 493 { 494 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 495 byte[16] exact = [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0]; 496 __m128i result = _mm_bsrli_si128!5(toShift); 497 assert( (cast(byte16)result).array == exact); 498 } 499 500 /// Cast vector of type `__m128d` to type `__m128`. 501 /// Note: Also possible with a regular `cast(__m128)(a)`. 502 __m128 _mm_castpd_ps (__m128d a) pure @safe 503 { 504 return cast(__m128)a; 505 } 506 507 /// Cast vector of type `__m128d` to type `__m128i`. 508 /// Note: Also possible with a regular `cast(__m128i)(a)`. 509 __m128i _mm_castpd_si128 (__m128d a) pure @safe 510 { 511 return cast(__m128i)a; 512 } 513 514 /// Cast vector of type `__m128` to type `__m128d`. 515 /// Note: Also possible with a regular `cast(__m128d)(a)`. 516 __m128d _mm_castps_pd (__m128 a) pure @safe 517 { 518 return cast(__m128d)a; 519 } 520 521 /// Cast vector of type `__m128` to type `__m128i`. 522 /// Note: Also possible with a regular `cast(__m128i)(a)`. 523 __m128i _mm_castps_si128 (__m128 a) pure @safe 524 { 525 return cast(__m128i)a; 526 } 527 528 /// Cast vector of type `__m128i` to type `__m128d`. 529 /// Note: Also possible with a regular `cast(__m128d)(a)`. 530 __m128d _mm_castsi128_pd (__m128i a) pure @safe 531 { 532 return cast(__m128d)a; 533 } 534 535 /// Cast vector of type `__m128i` to type `__m128`. 536 /// Note: Also possible with a regular `cast(__m128)(a)`. 537 __m128 _mm_castsi128_ps (__m128i a) pure @safe 538 { 539 return cast(__m128)a; 540 } 541 542 /// Invalidate and flush the cache line that contains `p` 543 /// from all levels of the cache hierarchy. 544 void _mm_clflush (const(void)* p) @trusted 545 { 546 static if (GDC_with_SSE2) 547 { 548 __builtin_ia32_clflush(p); 549 } 550 else static if (LDC_with_SSE2) 551 { 552 __builtin_ia32_clflush(cast(void*)p); 553 } 554 else version(D_InlineAsm_X86) 555 { 556 asm pure nothrow @nogc @safe 557 { 558 mov EAX, p; 559 clflush [EAX]; 560 } 561 } 562 else version(D_InlineAsm_X86_64) 563 { 564 asm pure nothrow @nogc @safe 565 { 566 mov RAX, p; 567 clflush [RAX]; 568 } 569 } 570 else 571 { 572 // Do nothing. Invalidating cacheline does 573 // not affect correctness. 574 } 575 } 576 unittest 577 { 578 ubyte[64] cacheline; 579 _mm_clflush(cacheline.ptr); 580 } 581 582 /// Compare packed 16-bit integers in `a` and `b` for equality. 583 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe 584 { 585 static if (GDC_with_SSE2) 586 { 587 return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b); 588 } 589 else 590 { 591 return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b); 592 } 593 } 594 unittest 595 { 596 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 597 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 598 short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0]; 599 short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B)); 600 assert(R.array == E); 601 } 602 603 /// Compare packed 32-bit integers in `a` and `b` for equality. 
604 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe 605 { 606 static if (GDC_with_SSE2) 607 { 608 return __builtin_ia32_pcmpeqd128(a, b); 609 } 610 else 611 { 612 return equalMask!__m128i(a, b); 613 } 614 } 615 unittest 616 { 617 int4 A = [-3, -2, -1, 0]; 618 int4 B = [ 4, -2, 2, 0]; 619 int[4] E = [ 0, -1, 0, -1]; 620 int4 R = cast(int4)(_mm_cmpeq_epi32(A, B)); 621 assert(R.array == E); 622 } 623 624 /// Compare packed 8-bit integers in `a` and `b` for equality. 625 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe 626 { 627 static if (GDC_with_SSE2) 628 { 629 return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b); 630 } 631 else 632 { 633 return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b); 634 } 635 } 636 unittest 637 { 638 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 639 __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 640 byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B); 641 byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1]; 642 assert(C.array == correct); 643 } 644 645 /// Compare packed double-precision (64-bit) floating-point elements 646 /// in `a` and `b` for equality. 647 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe 648 { 649 static if (GDC_with_SSE2) 650 { 651 return __builtin_ia32_cmpeqpd(a, b); 652 } 653 else 654 { 655 return cast(__m128d) cmppd!(FPComparison.oeq)(a, b); 656 } 657 } 658 659 /// Compare the lower double-precision (64-bit) floating-point elements 660 /// in `a` and `b` for equality, store the result in the lower element, 661 /// and copy the upper element from `a`. 662 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe 663 { 664 static if (GDC_with_SSE2) 665 { 666 return __builtin_ia32_cmpeqsd(a, b); 667 } 668 else 669 { 670 return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b); 671 } 672 } 673 674 /// Compare packed double-precision (64-bit) floating-point elements 675 /// in `a` and `b` for greater-than-or-equal. 676 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe 677 { 678 static if (GDC_with_SSE2) 679 { 680 return __builtin_ia32_cmpgepd(a, b); 681 } 682 else 683 { 684 return cast(__m128d) cmppd!(FPComparison.oge)(a, b); 685 } 686 } 687 688 /// Compare the lower double-precision (64-bit) floating-point elements 689 /// in `a` and `b` for greater-than-or-equal, store the result in the 690 /// lower element, and copy the upper element from `a`. 691 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe 692 { 693 // Note: There is no __builtin_ia32_cmpgesd builtin. 694 static if (GDC_with_SSE2) 695 { 696 return __builtin_ia32_cmpnltsd(b, a); 697 } 698 else 699 { 700 return cast(__m128d) cmpsd!(FPComparison.oge)(a, b); 701 } 702 } 703 704 /// Compare packed 16-bit integers in `a` and `b` for greater-than. 705 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe 706 { 707 static if (GDC_with_SSE2) 708 { 709 return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b); 710 } 711 else 712 { 713 return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b); 714 } 715 } 716 unittest 717 { 718 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 719 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 720 short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1]; 721 short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B)); 722 assert(R.array == E); 723 } 724 725 /// Compare packed 32-bit integers in `a` and `b` for greater-than. 
726 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe 727 { 728 static if (GDC_with_SSE2) 729 { 730 return __builtin_ia32_pcmpgtd128(a, b); 731 } 732 else 733 { 734 return cast(__m128i)( greaterMask!int4(a, b)); 735 } 736 } 737 unittest 738 { 739 int4 A = [-3, 2, -1, 0]; 740 int4 B = [ 4, -2, 2, 0]; 741 int[4] E = [ 0, -1, 0, 0]; 742 int4 R = cast(int4)(_mm_cmpgt_epi32(A, B)); 743 assert(R.array == E); 744 } 745 746 /// Compare packed 8-bit integers in `a` and `b` for greater-than. 747 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe 748 { 749 // Workaround of a GCC bug here. 750 // Of course the GCC builtin is buggy and generates a weird (and wrong) sequence 751 // with __builtin_ia32_pcmpgtb128. 752 // GCC's emmintrin.h uses comparison operators we don't have instead. 753 // PERF: this is a quite severe GDC performance problem. 754 // Could be workarounded with inline assembly, or another algorithm I guess. 755 756 /* 757 static if (GDC_with_SSE2) 758 { 759 return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b); 760 } 761 else */ 762 { 763 return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b); 764 } 765 } 766 unittest 767 { 768 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 769 __m128i B = _mm_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 770 byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B); 771 byte[16] correct = [0, 0,-1, 0, -1, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0]; 772 __m128i D = _mm_cmpeq_epi8(A, B); 773 assert(C.array == correct); 774 } 775 776 /// Compare packed double-precision (64-bit) floating-point elements 777 /// in `a` and `b` for greater-than. 778 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe 779 { 780 static if (GDC_with_SSE2) 781 { 782 return __builtin_ia32_cmpgtpd(a, b); 783 } 784 else 785 { 786 return cast(__m128d) cmppd!(FPComparison.ogt)(a, b); 787 } 788 } 789 790 /// Compare the lower double-precision (64-bit) floating-point elements 791 /// in `a` and `b` for greater-than, store the result in the lower element, 792 /// and copy the upper element from `a`. 793 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe 794 { 795 // Note: There is no __builtin_ia32_cmpgtsd builtin. 796 static if (GDC_with_SSE2) 797 { 798 return __builtin_ia32_cmpnlesd(b, a); 799 } 800 else 801 { 802 return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b); 803 } 804 } 805 806 /// Compare packed double-precision (64-bit) floating-point elements 807 /// in `a` and `b` for less-than-or-equal. 808 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe 809 { 810 static if (GDC_with_SSE2) 811 { 812 return __builtin_ia32_cmplepd(a, b); 813 } 814 else 815 { 816 return cast(__m128d) cmppd!(FPComparison.ole)(a, b); 817 } 818 } 819 820 /// Compare the lower double-precision (64-bit) floating-point elements 821 /// in `a` and `b` for less-than-or-equal, store the result in the 822 /// lower element, and copy the upper element from `a`. 823 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe 824 { 825 static if (GDC_with_SSE2) 826 { 827 return __builtin_ia32_cmplesd(a, b); 828 } 829 else 830 { 831 return cast(__m128d) cmpsd!(FPComparison.ole)(a, b); 832 } 833 } 834 835 /// Compare packed 16-bit integers in `a` and `b` for less-than. 836 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe 837 { 838 return _mm_cmpgt_epi16(b, a); 839 } 840 841 /// Compare packed 32-bit integers in `a` and `b` for less-than. 
842 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe 843 { 844 return _mm_cmpgt_epi32(b, a); 845 } 846 847 /// Compare packed 8-bit integers in `a` and `b` for less-than. 848 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe 849 { 850 return _mm_cmpgt_epi8(b, a); 851 } 852 853 /// Compare packed double-precision (64-bit) floating-point elements 854 /// in `a` and `b` for less-than. 855 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe 856 { 857 static if (GDC_with_SSE2) 858 { 859 return __builtin_ia32_cmpltpd(a, b); 860 } 861 else 862 { 863 return cast(__m128d) cmppd!(FPComparison.olt)(a, b); 864 } 865 } 866 867 /// Compare the lower double-precision (64-bit) floating-point elements 868 /// in `a` and `b` for less-than, store the result in the lower 869 /// element, and copy the upper element from `a`. 870 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe 871 { 872 static if (GDC_with_SSE2) 873 { 874 return __builtin_ia32_cmpltsd(a, b); 875 } 876 else 877 { 878 return cast(__m128d) cmpsd!(FPComparison.olt)(a, b); 879 } 880 } 881 882 /// Compare packed double-precision (64-bit) floating-point elements 883 /// in `a` and `b` for not-equal. 884 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe 885 { 886 static if (GDC_with_SSE2) 887 { 888 return __builtin_ia32_cmpneqpd(a, b); 889 } 890 else 891 { 892 return cast(__m128d) cmppd!(FPComparison.une)(a, b); 893 } 894 } 895 896 /// Compare the lower double-precision (64-bit) floating-point elements 897 /// in `a` and `b` for not-equal, store the result in the lower 898 /// element, and copy the upper element from `a`. 899 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe 900 { 901 static if (GDC_with_SSE2) 902 { 903 return __builtin_ia32_cmpneqsd(a, b); 904 } 905 else 906 { 907 return cast(__m128d) cmpsd!(FPComparison.une)(a, b); 908 } 909 } 910 911 /// Compare packed double-precision (64-bit) floating-point elements 912 /// in `a` and `b` for not-greater-than-or-equal. 913 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe 914 { 915 static if (GDC_with_SSE2) 916 { 917 return __builtin_ia32_cmpngepd(a, b); 918 } 919 else 920 { 921 return cast(__m128d) cmppd!(FPComparison.ult)(a, b); 922 } 923 } 924 925 /// Compare the lower double-precision (64-bit) floating-point elements 926 /// in `a` and `b` for not-greater-than-or-equal, store the result in 927 /// the lower element, and copy the upper element from `a`. 928 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe 929 { 930 // Note: There is no __builtin_ia32_cmpngesd builtin. 931 static if (GDC_with_SSE2) 932 { 933 return __builtin_ia32_cmpltsd(b, a); 934 } 935 else 936 { 937 return cast(__m128d) cmpsd!(FPComparison.ult)(a, b); 938 } 939 } 940 941 /// Compare packed double-precision (64-bit) floating-point elements 942 /// in `a` and `b` for not-greater-than. 943 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe 944 { 945 static if (GDC_with_SSE2) 946 { 947 return __builtin_ia32_cmpngtpd(a, b); 948 } 949 else 950 { 951 return cast(__m128d) cmppd!(FPComparison.ule)(a, b); 952 } 953 } 954 955 /// Compare the lower double-precision (64-bit) floating-point elements 956 /// in `a` and `b` for not-greater-than, store the result in the 957 /// lower element, and copy the upper element from `a`. 958 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe 959 { 960 // Note: There is no __builtin_ia32_cmpngtsd builtin. 
961 static if (GDC_with_SSE2) 962 { 963 return __builtin_ia32_cmplesd(b, a); 964 } 965 else 966 { 967 return cast(__m128d) cmpsd!(FPComparison.ule)(a, b); 968 } 969 } 970 971 /// Compare packed double-precision (64-bit) floating-point elements 972 /// in `a` and `b` for not-less-than-or-equal. 973 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe 974 { 975 static if (GDC_with_SSE2) 976 { 977 return __builtin_ia32_cmpnlepd(a, b); 978 } 979 else 980 { 981 return cast(__m128d) cmppd!(FPComparison.ugt)(a, b); 982 } 983 } 984 985 /// Compare the lower double-precision (64-bit) floating-point elements 986 /// in `a` and `b` for not-less-than-or-equal, store the result in the 987 /// lower element, and copy the upper element from `a`. 988 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe 989 { 990 static if (GDC_with_SSE2) 991 { 992 return __builtin_ia32_cmpnlesd(a, b); 993 } 994 else 995 { 996 return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b); 997 } 998 } 999 1000 /// Compare packed double-precision (64-bit) floating-point elements 1001 /// in `a` and `b` for not-less-than. 1002 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe 1003 { 1004 static if (GDC_with_SSE2) 1005 { 1006 return __builtin_ia32_cmpnltpd(a, b); 1007 } 1008 else 1009 { 1010 return cast(__m128d) cmppd!(FPComparison.uge)(a, b); 1011 } 1012 } 1013 1014 /// Compare the lower double-precision (64-bit) floating-point elements 1015 /// in `a` and `b` for not-less-than, store the result in the lower 1016 /// element, and copy the upper element from `a`. 1017 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe 1018 { 1019 static if (GDC_with_SSE2) 1020 { 1021 return __builtin_ia32_cmpnltsd(a, b); 1022 } 1023 else 1024 { 1025 return cast(__m128d) cmpsd!(FPComparison.uge)(a, b); 1026 } 1027 } 1028 1029 /// Compare packed double-precision (64-bit) floating-point elements 1030 /// in `a` and `b` to see if neither is NaN. 1031 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe 1032 { 1033 static if (GDC_with_SSE2) 1034 { 1035 return __builtin_ia32_cmpordpd(a, b); 1036 } 1037 else 1038 { 1039 return cast(__m128d) cmppd!(FPComparison.ord)(a, b); 1040 } 1041 } 1042 1043 /// Compare the lower double-precision (64-bit) floating-point elements 1044 /// in `a` and `b` to see if neither is NaN, store the result in the 1045 /// lower element, and copy the upper element from `a` to the upper element. 1046 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe 1047 { 1048 static if (GDC_with_SSE2) 1049 { 1050 return __builtin_ia32_cmpordsd(a, b); 1051 } 1052 else 1053 { 1054 return cast(__m128d) cmpsd!(FPComparison.ord)(a, b); 1055 } 1056 } 1057 1058 /// Compare packed double-precision (64-bit) floating-point elements 1059 /// in `a` and `b` to see if either is NaN. 1060 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe 1061 { 1062 static if (GDC_with_SSE2) 1063 { 1064 return __builtin_ia32_cmpunordpd(a, b); 1065 } 1066 else 1067 { 1068 return cast(__m128d) cmppd!(FPComparison.uno)(a, b); 1069 } 1070 } 1071 1072 /// Compare the lower double-precision (64-bit) floating-point elements 1073 /// in `a` and `b` to see if either is NaN, store the result in the lower 1074 /// element, and copy the upper element from `a` to the upper element. 
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    // Note: For some of the _mm_comixx_sx intrinsics, the NaN semantics are not the same as those of the
    // comisd instruction: the intrinsic returns false for unordered operands instead.
    //
    // Actually C++ compilers disagree over the meaning of that instruction.
    // GCC handles NaNs like the comisd instruction (returns true if unordered),
    // but ICC, clang and MSVC deal with NaN the way the Intel Intrinsics Guide says.
    // We follow the majority behaviour. It seems GCC is buggy with NaNs.
    return a.array[0] == b.array[0];
}
unittest
{
    assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than-or-equal, and return the boolean
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] >= b.array[0];
}
unittest
{
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] > b.array[0];
}
unittest
{
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}
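
// The ordered/unordered scalar comparisons above (_mm_cmpord_sd / _mm_cmpunord_sd) return
// a mask in the low lane (all ones if the predicate holds, else all zeroes) and pass the
// high lane of `a` through. A minimal illustrative check of that mask semantics:
unittest
{
    __m128d A = _mm_setr_pd(double.nan, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 ord   = cast(long2) _mm_cmpord_sd(A, B);
    long2 unord = cast(long2) _mm_cmpunord_sd(A, B);
    assert(ord.array[0] == 0);    // NaN present => not ordered
    assert(unord.array[0] == -1); // NaN present => unordered
    // The upper lane is copied from `a` unchanged.
    assert((cast(double2)ord).array[1] == 2.0);
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).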
1143 int _mm_comile_sd (__m128d a, __m128d b) pure @safe 1144 { 1145 return a.array[0] <= b.array[0]; 1146 } 1147 unittest 1148 { 1149 assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1150 assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1151 assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0))); 1152 assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1153 assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1154 assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); 1155 } 1156 1157 /// Compare the lower double-precision (64-bit) floating-point element 1158 /// in `a` and `b` for less-than, and return the boolean result (0 or 1). 1159 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe 1160 { 1161 return a.array[0] < b.array[0]; 1162 } 1163 unittest 1164 { 1165 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1166 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1167 assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0))); 1168 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1169 assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1170 assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0))); 1171 } 1172 1173 /// Compare the lower double-precision (64-bit) floating-point element 1174 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1). 1175 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe 1176 { 1177 return a.array[0] != b.array[0]; 1178 } 1179 unittest 1180 { 1181 assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1182 assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1183 assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1184 assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1185 assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); 1186 } 1187 1188 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) 1189 /// floating-point elements. 1190 __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted 1191 { 1192 version(LDC) 1193 { 1194 // Generates cvtdq2pd since LDC 1.0, even without optimizations 1195 enum ir = ` 1196 %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1> 1197 %r = sitofp <2 x i32> %v to <2 x double> 1198 ret <2 x double> %r`; 1199 return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a); 1200 } 1201 else static if (GDC_with_SSE2) 1202 { 1203 return __builtin_ia32_cvtdq2pd(a); 1204 } 1205 else 1206 { 1207 double2 r = void; 1208 r.ptr[0] = a.array[0]; 1209 r.ptr[1] = a.array[1]; 1210 return r; 1211 } 1212 } 1213 unittest 1214 { 1215 __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54)); 1216 assert(A.array[0] == 54.0); 1217 assert(A.array[1] == 54.0); 1218 } 1219 1220 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 1221 /// floating-point elements. 1222 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted 1223 { 1224 static if (GDC_with_SSE2) 1225 { 1226 return __builtin_ia32_cvtdq2ps(a); 1227 } 1228 else version(LDC) 1229 { 1230 // See #86 for why we had to resort to LLVM IR. 1231 // Plain code below was leading to catastrophic behaviour. 
        // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x float>
            ret <4 x float> %r`;
        return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
    }
    else
    {
        __m128 res; // PERF =void;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    // PERF ARM32
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        long2 i;
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
            case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
            case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
        }
        int4 zero = 0;
        return cast(__m128i) shufflevector!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
    }
    else
    {
        // PERF ARM32
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    {
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}
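
// Like `_mm_cvtps_epi32` further below, `_mm_cvtpd_epi32` rounds according to the current
// rounding mode (MXCSR on x86, FPCR on AArch64). A minimal illustrative check; different
// literals are used per mode, mirroring the GDC workaround in the _mm_cvtps_epi32 test below.
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    __m128i A = _mm_cvtpd_epi32(_mm_setr_pd(1.9, -2.9));
    assert(A.array[0] == 1 && A.array[1] == -2);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    A = _mm_cvtpd_epi32(_mm_setr_pd(1.4, -2.6));
    assert(A.array[0] == 1 && A.array[1] == -3);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.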
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_cvtps2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    // GDC bug #98607
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide an optimization barrier for the rounding mode,
    // so this test is worked around with different literals for each mode.
    // The bug will likely only manifest in unittests.
    // The GCC maintainers provided no actual fix and instead claim that other
    // compilers are buggy here... when they are not.

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtsd_si64 (__m128d a) @trusted
{
    version (LDC)
    {
        version (X86_64)
        {
            return __builtin_ia32_cvtsd2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
    else
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///

/// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit)
/// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b);
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a.ptr[0] = b.array[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}

/// Get the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la.array[0];
}
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
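
// Illustrative usage of the scalar extraction helpers above, which have no dedicated
// unittest in this section (a minimal sketch; lane values assume the little-endian
// lane order used on all supported targets):
unittest
{
    __m128d d = _mm_set_pd(2.5, -1.25);
    assert(_mm_cvtsd_f64(d) == -1.25); // lower double of `d`

    __m128i i = _mm_setr_epi32(0x70707070, 0x78787878, 3, 4);
    assert(_mm_cvtsi128_si32(i) == 0x70707070);          // lower 32-bit lane
    assert(_mm_cvtsi128_si64(i) == 0x7878787870707070);  // lower 64-bit lane
}

/// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.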
__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 r = [0, 0, 0, 0];
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}

/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}

deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///

/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit)
/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper
/// element of result.
double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
/// Put zeroes in the upper elements of result.
__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r; // PERF =void;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}
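
// Unlike `_mm_cvtpd_epi32` above, the `_mm_cvtt*` intrinsics always truncate toward zero,
// independently of the current rounding mode. A minimal illustrative check:
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i T = _mm_cvttpd_epi32(_mm_setr_pd(1.8, -3.7));
    assert(T.array[0] == 1 && T.array[1] == -3); // truncated, not rounded to nearest
    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers with truncation.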
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: generates fcvtzs since LDC 1.8 -O2
    __m128i r; // PERF = void;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0,
    // but in 32-bit it is instead a long sequence that resorts to the FPU.
    return cast(long)a.array[0];
}

deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///

/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a / b;
}

/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`, store the
/// result in the lower element, and copy the upper element from `a`.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_divsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        a.ptr[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
    assert(_mm_extract_epi16(A, 5 + 8) == 5);
}
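
// `_mm_div_pd` and the scalar truncating conversions above have no dedicated test in this
// section; a minimal illustrative check:
unittest
{
    __m128d q = _mm_div_pd(_mm_setr_pd(9.0, -8.0), _mm_setr_pd(3.0, 2.0));
    assert(q.array == [3.0, -4.0]);

    assert(_mm_cvttsd_si32(_mm_set1_pd(-2.9)) == -2); // truncates toward zero
    assert(_mm_cvttsd_si64(_mm_set1_pd(-56468486186.9)) == -56468486186);
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.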
1758 __m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted 1759 { 1760 short8 r = cast(short8)v; 1761 r.ptr[index & 7] = cast(short)i; 1762 return cast(__m128i)r; 1763 } 1764 unittest 1765 { 1766 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 1767 short8 R = cast(short8) _mm_insert_epi16(A, 42, 6); 1768 short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7]; 1769 assert(R.array == correct); 1770 } 1771 1772 /// Perform a serializing operation on all load-from-memory instructions that were issued prior 1773 /// to this instruction. Guarantees that every load instruction that precedes, in program order, 1774 /// is globally visible before any load instruction which follows the fence in program order. 1775 void _mm_lfence() @trusted 1776 { 1777 version(GNU) 1778 { 1779 static if (GDC_with_SSE2) 1780 { 1781 __builtin_ia32_lfence(); 1782 } 1783 else version(X86) 1784 { 1785 asm pure nothrow @nogc @trusted 1786 { 1787 "lfence;\n" : : : ; 1788 } 1789 } 1790 else 1791 static assert(false); 1792 } 1793 else static if (LDC_with_SSE2) 1794 { 1795 __builtin_ia32_lfence(); 1796 } 1797 else static if (LDC_with_ARM64) 1798 { 1799 __builtin_arm_dmb(9); // dmb ishld 1800 } 1801 else static if (DMD_with_asm) 1802 { 1803 asm nothrow @nogc pure @safe 1804 { 1805 lfence; 1806 } 1807 } 1808 else version(LDC) 1809 { 1810 // When the architecture is unknown, generate a full memory barrier, 1811 // as the semantics of sfence do not really match those of atomics. 1812 llvm_memory_fence(); 1813 } 1814 else 1815 static assert(false); 1816 } 1817 unittest 1818 { 1819 _mm_lfence(); 1820 } 1821 1822 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 1823 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1824 __m128d _mm_load_pd (const(double) * mem_addr) pure 1825 { 1826 pragma(inline, true); 1827 __m128d* aligned = cast(__m128d*)mem_addr; 1828 return *aligned; 1829 } 1830 unittest 1831 { 1832 align(16) double[2] S = [-5.0, 7.0]; 1833 __m128d R = _mm_load_pd(S.ptr); 1834 assert(R.array == S); 1835 } 1836 1837 /// Load a double-precision (64-bit) floating-point element from memory into both elements of dst. 1838 /// `mem_addr` does not need to be aligned on any particular boundary. 1839 __m128d _mm_load_pd1 (const(double)* mem_addr) pure 1840 { 1841 double m = *mem_addr; 1842 __m128d r; // PERF =void; 1843 r.ptr[0] = m; 1844 r.ptr[1] = m; 1845 return r; 1846 } 1847 unittest 1848 { 1849 double what = 4; 1850 __m128d R = _mm_load_pd1(&what); 1851 double[2] correct = [4.0, 4]; 1852 assert(R.array == correct); 1853 } 1854 1855 /// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper 1856 /// element. `mem_addr` does not need to be aligned on any particular boundary. 1857 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted 1858 { 1859 double2 r = [0, 0]; 1860 r.ptr[0] = *mem_addr; 1861 return r; 1862 } 1863 unittest 1864 { 1865 double x = -42; 1866 __m128d a = _mm_load_sd(&x); 1867 assert(a.array == [-42.0, 0.0]); 1868 } 1869 1870 /// Load 128-bits of integer data from memory into dst. 1871 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
1872 __m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shoudln't be trusted because alignment, Issue #62 1873 { 1874 pragma(inline, true); 1875 return *mem_addr; 1876 } 1877 unittest 1878 { 1879 align(16) int[4] correct = [-1, 2, 3, 4]; 1880 int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr); 1881 assert(A.array == correct); 1882 } 1883 1884 alias _mm_load1_pd = _mm_load_pd1; /// 1885 1886 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 1887 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary. 1888 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted 1889 { 1890 pragma(inline, true); 1891 a.ptr[1] = *mem_addr; 1892 return a; 1893 } 1894 unittest 1895 { 1896 double A = 7.0; 1897 __m128d B = _mm_setr_pd(4.0, -5.0); 1898 __m128d R = _mm_loadh_pd(B, &A); 1899 double[2] correct = [ 4.0, 7.0 ]; 1900 assert(R.array == correct); 1901 } 1902 1903 /// Load 64-bit integer from memory into the first element of result. Zero out the other. 1904 // Note: strange signature since the memory doesn't have to aligned (Issue #60) 1905 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature 1906 { 1907 pragma(inline, true); 1908 auto pLong = cast(const(long)*)mem_addr; 1909 long2 r = [0, 0]; 1910 r.ptr[0] = *pLong; 1911 return cast(__m128i)(r); 1912 } 1913 unittest 1914 { 1915 long A = 0x7878787870707070; 1916 long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A); 1917 long[2] correct = [0x7878787870707070, 0]; 1918 assert(R.array == correct); 1919 } 1920 1921 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 1922 /// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary. 1923 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted 1924 { 1925 a.ptr[0] = *mem_addr; 1926 return a; 1927 } 1928 unittest 1929 { 1930 double A = 7.0; 1931 __m128d B = _mm_setr_pd(4.0, -5.0); 1932 __m128d R = _mm_loadl_pd(B, &A); 1933 double[2] correct = [ 7.0, -5.0 ]; 1934 assert(R.array == correct); 1935 } 1936 1937 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 1938 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1939 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted 1940 { 1941 __m128d a = *cast(__m128d*)(mem_addr); 1942 __m128d r; // PERF =void; 1943 r.ptr[0] = a.array[1]; 1944 r.ptr[1] = a.array[0]; 1945 return r; 1946 } 1947 unittest 1948 { 1949 align(16) double[2] A = [56.0, -74.0]; 1950 __m128d R = _mm_loadr_pd(A.ptr); 1951 double[2] correct = [-74.0, 56.0]; 1952 assert(R.array == correct); 1953 } 1954 1955 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 1956 /// `mem_addr` does not need to be aligned on any particular boundary. 1957 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted 1958 { 1959 pragma(inline, true); 1960 static if (GDC_with_SSE2) 1961 { 1962 return __builtin_ia32_loadupd(mem_addr); 1963 } 1964 else version(LDC) 1965 { 1966 return loadUnaligned!(double2)(mem_addr); 1967 } 1968 else version(DigitalMars) 1969 { 1970 // Apparently inside __simd you can use aligned dereferences without fear. 1971 // That was issue 23048 on dlang's Bugzilla. 
1972 static if (DMD_with_DSIMD) 1973 { 1974 return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr); 1975 } 1976 else static if (SSESizedVectorsAreEmulated) 1977 { 1978 // Since this vector is emulated, it doesn't have alignement constraints 1979 // and as such we can just cast it. 1980 return *cast(__m128d*)(mem_addr); 1981 } 1982 else 1983 { 1984 __m128d result; 1985 result.ptr[0] = mem_addr[0]; 1986 result.ptr[1] = mem_addr[1]; 1987 return result; 1988 } 1989 } 1990 else 1991 { 1992 __m128d result; 1993 result.ptr[0] = mem_addr[0]; 1994 result.ptr[1] = mem_addr[1]; 1995 return result; 1996 } 1997 } 1998 unittest 1999 { 2000 double[2] A = [56.0, -75.0]; 2001 __m128d R = _mm_loadu_pd(A.ptr); 2002 double[2] correct = [56.0, -75.0]; 2003 assert(R.array == correct); 2004 } 2005 2006 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary. 2007 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted 2008 { 2009 // PERF DMD 2010 pragma(inline, true); 2011 static if (GDC_with_SSE2) 2012 { 2013 return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr); 2014 } 2015 else version(LDC) 2016 { 2017 return loadUnaligned!(__m128i)(cast(int*)mem_addr); 2018 } 2019 else 2020 { 2021 const(int)* p = cast(const(int)*)mem_addr; 2022 __m128i r = void; 2023 r.ptr[0] = p[0]; 2024 r.ptr[1] = p[1]; 2025 r.ptr[2] = p[2]; 2026 r.ptr[3] = p[3]; 2027 return r; 2028 } 2029 } 2030 unittest 2031 { 2032 align(16) int[4] correct = [-1, 2, -3, 4]; 2033 int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr); 2034 assert(A.array == correct); 2035 } 2036 2037 /// Load unaligned 32-bit integer from memory into the first element of result. 2038 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted 2039 { 2040 pragma(inline, true); 2041 int r = *cast(int*)(mem_addr); 2042 int4 result = [0, 0, 0, 0]; 2043 result.ptr[0] = r; 2044 return result; 2045 } 2046 unittest 2047 { 2048 int r = 42; 2049 __m128i A = _mm_loadu_si32(&r); 2050 int[4] correct = [42, 0, 0, 0]; 2051 assert(A.array == correct); 2052 } 2053 2054 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 2055 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 2056 /// and pack the results in destination. 
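///
/// In other words, with 16-bit lanes `a = [a0, a1, ..., a7]` and `b = [b0, b1, ..., b7]`,
/// the four 32-bit result lanes are `[a0*b0 + a1*b1, a2*b2 + a3*b3, a4*b4 + a5*b5, a6*b6 + a7*b7]`.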
2057 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted 2058 { 2059 static if (GDC_with_SSE2) 2060 { 2061 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2062 } 2063 else static if (LDC_with_SSE2) 2064 { 2065 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2066 } 2067 else static if (LDC_with_ARM64) 2068 { 2069 int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b)); 2070 int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b)); 2071 int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); 2072 int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); 2073 return vcombine_s32(rl, rh); 2074 } 2075 else 2076 { 2077 short8 sa = cast(short8)a; 2078 short8 sb = cast(short8)b; 2079 int4 r; 2080 foreach(i; 0..4) 2081 { 2082 r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1]; 2083 } 2084 return r; 2085 } 2086 } 2087 unittest 2088 { 2089 short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2090 short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2091 int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B); 2092 int[4] correct = [1, 13, -2147483648, 2*32767*32767]; 2093 assert(R.array == correct); 2094 } 2095 2096 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 2097 /// (elements are not stored when the highest bit is not set in the corresponding element) 2098 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 2099 /// boundary. 2100 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted 2101 { 2102 static if (GDC_with_SSE2) 2103 { 2104 return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr); 2105 } 2106 else static if (LDC_with_SSE2) 2107 { 2108 return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr); 2109 } 2110 else static if (LDC_with_ARM64) 2111 { 2112 // PERF: catastrophic on ARM32 2113 byte16 bmask = cast(byte16)mask; 2114 byte16 shift = 7; 2115 bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask 2116 mask = cast(__m128i) bmask; 2117 __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr); 2118 dest = (a & mask) | (dest & ~mask); 2119 storeUnaligned!__m128i(dest, cast(int*)mem_addr); 2120 } 2121 else 2122 { 2123 byte16 b = cast(byte16)a; 2124 byte16 m = cast(byte16)mask; 2125 byte* dest = cast(byte*)(mem_addr); 2126 foreach(j; 0..16) 2127 { 2128 if (m.array[j] & 128) 2129 { 2130 dest[j] = b.array[j]; 2131 } 2132 } 2133 } 2134 } 2135 unittest 2136 { 2137 ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]; 2138 __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0); 2139 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15); 2140 _mm_maskmoveu_si128(A, mask, dest.ptr); 2141 ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42]; 2142 assert(dest == correct); 2143 } 2144 2145 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values. 
2146 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe 2147 { 2148 static if (GDC_with_SSE2) 2149 { 2150 return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b); 2151 } 2152 else version(LDC) 2153 { 2154 // x86: pmaxsw since LDC 1.0 -O1 2155 // ARM: smax.8h since LDC 1.5 -01 2156 short8 sa = cast(short8)a; 2157 short8 sb = cast(short8)b; 2158 short8 greater = greaterMask!short8(sa, sb); 2159 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2160 } 2161 else 2162 { 2163 __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else 2164 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2165 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2166 return _mm_xor_si128(b, mask); 2167 } 2168 } 2169 unittest 2170 { 2171 short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57), 2172 _mm_setr_epi16(-4,-8, 9, 7, 0,-32768, 0, 0)); 2173 short[8] correct = [32767, 1, 9, 7, 9, 7, 0, 0]; 2174 assert(R.array == correct); 2175 } 2176 2177 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values. 2178 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe 2179 { 2180 version(LDC) 2181 { 2182 // x86: pmaxub since LDC 1.0.0 -O1 2183 // ARM64: umax.16b since LDC 1.5.0 -O1 2184 // PERF: catastrophic on ARM32 2185 ubyte16 sa = cast(ubyte16)a; 2186 ubyte16 sb = cast(ubyte16)b; 2187 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2188 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2189 } 2190 else 2191 { 2192 __m128i value128 = _mm_set1_epi8(-128); 2193 __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2194 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2195 __m128i mask = _mm_and_si128(aTob, higher); 2196 return _mm_xor_si128(b, mask); 2197 } 2198 } 2199 unittest 2200 { 2201 byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2202 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2203 byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 2204 assert(R.array == correct); 2205 } 2206 2207 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values. 2208 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted 2209 { 2210 static if (GDC_with_SSE2) 2211 { 2212 return __builtin_ia32_maxpd(a, b); 2213 } 2214 else 2215 { 2216 // x86: Generates maxpd starting with LDC 1.9 -O2 2217 a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; 2218 a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1]; 2219 return a; 2220 } 2221 } 2222 unittest 2223 { 2224 __m128d A = _mm_setr_pd(4.0, 1.0); 2225 __m128d B = _mm_setr_pd(1.0, 8.0); 2226 __m128d M = _mm_max_pd(A, B); 2227 assert(M.array[0] == 4.0); 2228 assert(M.array[1] == 8.0); 2229 } 2230 2231 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 2232 /// lower element of result, and copy the upper element from `a` to the upper element of result. 2233 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted 2234 { 2235 static if (GDC_with_SSE2) 2236 { 2237 return __builtin_ia32_maxsd(a, b); 2238 } 2239 else 2240 { 2241 __m128d r = a; 2242 // Generates maxsd starting with LDC 1.3 2243 r.ptr[0] = (a.array[0] > b.array[0]) ? 
a.array[0] : b.array[0]; 2244 return r; 2245 } 2246 } 2247 unittest 2248 { 2249 __m128d A = _mm_setr_pd(1.0, 1.0); 2250 __m128d B = _mm_setr_pd(4.0, 2.0); 2251 __m128d M = _mm_max_sd(A, B); 2252 assert(M.array[0] == 4.0); 2253 assert(M.array[1] == 1.0); 2254 } 2255 2256 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 2257 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 2258 /// is globally visible before any memory instruction which follows the fence in program order. 2259 void _mm_mfence() @trusted // not pure! 2260 { 2261 version(GNU) 2262 { 2263 static if (GDC_with_SSE2) 2264 { 2265 __builtin_ia32_mfence(); 2266 } 2267 else version(X86) 2268 { 2269 asm pure nothrow @nogc @trusted 2270 { 2271 "mfence;\n" : : : ; 2272 } 2273 } 2274 else 2275 static assert(false); 2276 } 2277 else static if (LDC_with_SSE2) 2278 { 2279 __builtin_ia32_mfence(); 2280 } 2281 else static if (DMD_with_asm) 2282 { 2283 asm nothrow @nogc pure @safe 2284 { 2285 mfence; 2286 } 2287 } 2288 else version(LDC) 2289 { 2290 // Note: will generate the DMB ish instruction on ARM 2291 llvm_memory_fence(); 2292 } 2293 else 2294 static assert(false); 2295 } 2296 unittest 2297 { 2298 _mm_mfence(); 2299 } 2300 2301 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values. 2302 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe 2303 { 2304 static if (GDC_with_SSE2) 2305 { 2306 return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b); 2307 } 2308 else version(LDC) 2309 { 2310 // x86: pminsw since LDC 1.0 -O1 2311 // ARM64: smin.8h since LDC 1.5 -01 2312 short8 sa = cast(short8)a; 2313 short8 sb = cast(short8)b; 2314 short8 greater = greaterMask!short8(sa, sb); 2315 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2316 } 2317 else 2318 { 2319 __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else 2320 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2321 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2322 return _mm_xor_si128(b, mask); 2323 } 2324 } 2325 unittest 2326 { 2327 short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-32768), 2328 _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0)); 2329 short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -32768]; 2330 assert(R.array == correct); 2331 } 2332 2333 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values. 
2334 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe 2335 { 2336 version(LDC) 2337 { 2338 // x86: pminub since LDC 1.0.0 -O1 2339 // ARM: umin.16b since LDC 1.5.0 -O1 2340 // PERF: catastrophic on ARM32 2341 ubyte16 sa = cast(ubyte16)a; 2342 ubyte16 sb = cast(ubyte16)b; 2343 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2344 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2345 } 2346 else 2347 { 2348 __m128i value128 = _mm_set1_epi8(-128); 2349 __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2350 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2351 __m128i mask = _mm_and_si128(aTob, lower); 2352 return _mm_xor_si128(b, mask); 2353 } 2354 } 2355 unittest 2356 { 2357 byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2358 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2359 byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; 2360 assert(R.array == correct); 2361 } 2362 2363 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values. 2364 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted 2365 { 2366 static if (GDC_with_SSE2) 2367 { 2368 return __builtin_ia32_minpd(a, b); 2369 } 2370 else 2371 { 2372 // Generates minpd starting with LDC 1.9 2373 a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2374 a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1]; 2375 return a; 2376 } 2377 } 2378 unittest 2379 { 2380 __m128d A = _mm_setr_pd(1.0, 2.0); 2381 __m128d B = _mm_setr_pd(4.0, 1.0); 2382 __m128d M = _mm_min_pd(A, B); 2383 assert(M.array[0] == 1.0); 2384 assert(M.array[1] == 1.0); 2385 } 2386 2387 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 2388 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 2389 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe 2390 { 2391 static if (GDC_with_SSE2) 2392 { 2393 return __builtin_ia32_minsd(a, b); 2394 } 2395 else 2396 { 2397 // Generates minsd starting with LDC 1.3 2398 __m128d r = a; 2399 r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2400 return r; 2401 } 2402 } 2403 unittest 2404 { 2405 __m128d A = _mm_setr_pd(1.0, 3.0); 2406 __m128d B = _mm_setr_pd(4.0, 2.0); 2407 __m128d M = _mm_min_sd(A, B); 2408 assert(M.array[0] == 1.0); 2409 assert(M.array[1] == 3.0); 2410 } 2411 2412 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element. 2413 __m128i _mm_move_epi64 (__m128i a) pure @trusted 2414 { 2415 static if (GDC_with_SSE2) 2416 { 2417 // slightly better with GDC -O0 2418 return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 2419 } 2420 else 2421 { 2422 long2 result = [ 0, 0 ]; 2423 long2 la = cast(long2) a; 2424 result.ptr[0] = la.array[0]; 2425 return cast(__m128i)(result); 2426 } 2427 } 2428 unittest 2429 { 2430 long2 A = [13, 47]; 2431 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); 2432 long[2] correct = [13, 0]; 2433 assert(B.array == correct); 2434 } 2435 2436 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 2437 /// the upper element from `a` to the upper element of dst. 
__m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movsd(a, b);
    }
    else
    {
        b.ptr[1] = a.array[1];
        return b;
    }
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(C.array == correct);
}

/// Create mask from the most significant bit of each 8-bit element in `a`.
int _mm_movemask_epi8 (__m128i a) pure @trusted
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pmovmskb128(cast(byte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions lead to unfound intrinsics in LLVM and that took a long time.
        // So there might be something a bit faster, but this one is reasonable and branchless.
        byte8 mask_shift;
        mask_shift.ptr[0] = 7;
        mask_shift.ptr[1] = 6;
        mask_shift.ptr[2] = 5;
        mask_shift.ptr[3] = 4;
        mask_shift.ptr[4] = 3;
        mask_shift.ptr[5] = 2;
        mask_shift.ptr[6] = 1;
        mask_shift.ptr[7] = 0;
        byte8 mask_and = byte8(-128);
        byte8 lo = vget_low_u8(cast(byte16)a);
        byte8 hi = vget_high_u8(cast(byte16)a);
        lo = vand_u8(lo, mask_and);
        lo = vshr_u8(lo, mask_shift);
        hi = vand_u8(hi, mask_and);
        hi = vshr_u8(hi, mask_shift);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
    }
    else
    {
        byte16 ai = cast(byte16)a;
        int r = 0;
        foreach(bit; 0..16)
        {
            if (ai.array[bit] < 0) r += (1 << bit);
        }
        return r;
    }
}
unittest
{
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
}

/// Create mask from the most significant bit of each 16-bit element in `a`. #BONUS
int _mm_movemask_epi16 (__m128i a) pure @trusted
{
    return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128()));
}
unittest
{
    assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8)));
}

/// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit)
/// floating-point element in `v`.
int _mm_movemask_pd(__m128d v) pure @safe
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_with_SSE2)
    {
        /// Set each bit of mask `dst` based on the most significant bit of the corresponding
        /// packed double-precision (64-bit) floating-point element in `v`.
        return __builtin_ia32_movmskpd(v);
    }
    else static if (LDC_with_SSE2)
    {
        /// Set each bit of mask `dst` based on the most significant bit of the corresponding
        /// packed double-precision (64-bit) floating-point element in `v`.
        return __builtin_ia32_movmskpd(v);
    }
    else
    {
        long2 lv = cast(long2)v;
        int r = 0;
        if (lv.array[0] < 0) r += 1;
        if (lv.array[1] < 0) r += 2;
        return r;
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
    assert(_mm_movemask_pd(A) == 2);
}

/// Copy the lower 64-bit integer in `v` to the result.
__m64 _mm_movepi64_pi64 (__m128i v) pure @safe
{
    long2 lv = cast(long2)v;
    return long1(lv.array[0]);
}
unittest
{
    __m128i A = _mm_set_epi64x(-1, -2);
    __m64 R = _mm_movepi64_pi64(A);
    assert(R.array[0] == -2);
}

/// Copy the 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
{
    long2 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = 0;
    return cast(__m128i)r;
}

/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`,
/// and return the unsigned 64-bit results.
// Note: generates pmuludq in LDC with -O1
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
{
    __m128i zero = _mm_setzero_si128();

    static if (__VERSION__ >= 2088)
    {
        // Need LLVM9 to avoid this shufflevector
        long2 la, lb;
        la.ptr[0] = cast(uint)a.array[0];
        la.ptr[1] = cast(uint)a.array[2];
        lb.ptr[0] = cast(uint)b.array[0];
        lb.ptr[1] = cast(uint)b.array[2];
    }
    else
    {
        long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero); // TODO remove this use of shufflevector except for LDC
        long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
    }

    version(DigitalMars)
    {
        // DMD has no long2 mul
        // long2 mul not supported before LDC 1.5
        la.ptr[0] *= lb.array[0];
        la.ptr[1] *= lb.array[1];
        return cast(__m128i)(la);
    }
    else
    {
        static if (__VERSION__ >= 2076)
        {
            return cast(__m128i)(la * lb);
        }
        else
        {
            // long2 mul not supported before LDC 1.5
            la.ptr[0] *= lb.array[0];
            la.ptr[1] *= lb.array[1];
            return cast(__m128i)(la);
        }
    }
}
unittest
{
    __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
    __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL);
    assert(LC.array[1] == 12723420444339690338uL);
}

/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a * b;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_pd(a, a);
    assert(a.array == [4.0, 2.25]);
}

/// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower
/// element of result, and copy the upper element from `a` to the upper element of result.
2648 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted 2649 { 2650 version(DigitalMars) 2651 { 2652 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 2653 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 2654 asm pure nothrow @nogc @trusted { nop;} 2655 a.array[0] = a.array[0] * b.array[0]; 2656 return a; 2657 } 2658 else static if (GDC_with_SSE2) 2659 { 2660 return __builtin_ia32_mulsd(a, b); 2661 } 2662 else 2663 { 2664 a.ptr[0] *= b.array[0]; 2665 return a; 2666 } 2667 } 2668 unittest 2669 { 2670 __m128d a = [-2.0, 1.5]; 2671 a = _mm_mul_sd(a, a); 2672 assert(a.array == [4.0, 1.5]); 2673 } 2674 2675 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 2676 /// and get an unsigned 64-bit result. 2677 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe 2678 { 2679 return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b))); 2680 } 2681 unittest 2682 { 2683 __m64 A = _mm_set_pi32(42, 0xDEADBEEF); 2684 __m64 B = _mm_set_pi32(42, 0xCAFEBABE); 2685 __m64 C = _mm_mul_su32(A, B); 2686 assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL); 2687 } 2688 2689 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2690 /// high 16 bits of the intermediate integers. 2691 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted 2692 { 2693 static if (GDC_with_SSE2) 2694 { 2695 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2696 } 2697 else static if (LDC_with_SSE2) 2698 { 2699 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2700 } 2701 else 2702 { 2703 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h 2704 // PERF: it seems the simde solution has one less instruction in ARM64. 2705 // PERF: Catastrophic in ARM32. 2706 short8 sa = cast(short8)a; 2707 short8 sb = cast(short8)b; 2708 short8 r = void; 2709 r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16; 2710 r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16; 2711 r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16; 2712 r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16; 2713 r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16; 2714 r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16; 2715 r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16; 2716 r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16; 2717 return cast(__m128i)r; 2718 } 2719 } 2720 unittest 2721 { 2722 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2723 __m128i B = _mm_set1_epi16(16384); 2724 short8 R = cast(short8)_mm_mulhi_epi16(A, B); 2725 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; 2726 assert(R.array == correct); 2727 } 2728 2729 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2730 /// high 16 bits of the intermediate integers. 2731 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted 2732 { 2733 static if (GDC_with_SSE2) 2734 { 2735 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2736 } 2737 else static if (LDC_with_SSE2) 2738 { 2739 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2740 } 2741 else 2742 { 2743 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h 2744 // it seems the simde solution has one less instruction in ARM64 2745 // PERF: Catastrophic in ARM32. 
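        // Portable fallback: each pair of lanes is multiplied as a 32-bit `int` (the `ushort` values are
        // promoted), and the `cast(short)` keeps bits 31..16 of the product, i.e. exactly the unsigned high half.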
2746 short8 sa = cast(short8)a; 2747 short8 sb = cast(short8)b; 2748 short8 r = void; 2749 r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 ); 2750 r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 ); 2751 r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 ); 2752 r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 ); 2753 r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 ); 2754 r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 ); 2755 r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 ); 2756 r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 ); 2757 return cast(__m128i)r; 2758 } 2759 } 2760 unittest 2761 { 2762 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2763 __m128i B = _mm_set1_epi16(16384); 2764 short8 R = cast(short8)_mm_mulhi_epu16(A, B); 2765 short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; 2766 assert(R.array == correct); 2767 } 2768 2769 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 2770 /// bits of the intermediate integers. 2771 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe 2772 { 2773 return cast(__m128i)(cast(short8)a * cast(short8)b); 2774 } 2775 unittest 2776 { 2777 __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7); 2778 __m128i B = _mm_set1_epi16(16384); 2779 short8 R = cast(short8)_mm_mullo_epi16(A, B); 2780 short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384]; 2781 assert(R.array == correct); 2782 } 2783 2784 /// Compute the bitwise NOT of 128 bits in `a`. #BONUS 2785 __m128i _mm_not_si128 (__m128i a) pure @safe 2786 { 2787 return ~a; 2788 } 2789 unittest 2790 { 2791 __m128i A = _mm_set1_epi32(-748); 2792 int4 notA = cast(int4) _mm_not_si128(A); 2793 int[4] correct = [747, 747, 747, 747]; 2794 assert(notA.array == correct); 2795 } 2796 2797 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 2798 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe 2799 { 2800 pragma(inline, true); 2801 return cast(__m128d)( cast(__m128i)a | cast(__m128i)b ); 2802 } 2803 2804 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`. 2805 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe 2806 { 2807 pragma(inline, true); 2808 return a | b; 2809 } 2810 2811 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation. 
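/// The four saturated elements of `a` form the low half of the result and the four saturated
/// elements of `b` form the high half.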
2812 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted 2813 { 2814 static if (GDC_with_SSE2) 2815 { 2816 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2817 } 2818 else static if (LDC_with_SSE2) 2819 { 2820 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2821 } 2822 else static if (LDC_with_ARM64) 2823 { 2824 short4 ra = vqmovn_s32(cast(int4)a); 2825 short4 rb = vqmovn_s32(cast(int4)b); 2826 return cast(__m128i)vcombine_s16(ra, rb); 2827 } 2828 else 2829 { 2830 // PERF: catastrophic on ARM32 2831 short8 r; 2832 r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]); 2833 r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]); 2834 r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]); 2835 r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]); 2836 r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]); 2837 r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]); 2838 r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]); 2839 r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]); 2840 return cast(__m128i)r; 2841 } 2842 } 2843 unittest 2844 { 2845 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 2846 short8 R = cast(short8) _mm_packs_epi32(A, A); 2847 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; 2848 assert(R.array == correct); 2849 } 2850 2851 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation. 2852 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted 2853 { 2854 static if (GDC_with_SSE2) 2855 { 2856 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2857 } 2858 else static if (LDC_with_SSE2) 2859 { 2860 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2861 } 2862 else static if (LDC_with_ARM64) 2863 { 2864 // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02 2865 byte8 ra = vqmovn_s16(cast(short8)a); 2866 byte8 rb = vqmovn_s16(cast(short8)b); 2867 return cast(__m128i)vcombine_s8(ra, rb); 2868 } 2869 else 2870 { 2871 // PERF: ARM32 is missing 2872 byte16 r; 2873 short8 sa = cast(short8)a; 2874 short8 sb = cast(short8)b; 2875 foreach(i; 0..8) 2876 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]); 2877 foreach(i; 0..8) 2878 r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]); 2879 return cast(__m128i)r; 2880 } 2881 } 2882 unittest 2883 { 2884 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); 2885 byte16 R = cast(byte16) _mm_packs_epi16(A, A); 2886 byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0, 2887 127, -128, 127, 0, 127, -128, 127, 0]; 2888 assert(R.array == correct); 2889 } 2890 2891 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation. 
2892 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2893 { 2894 // PERF DMD catastrophic 2895 static if (GDC_with_SSE2) 2896 { 2897 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2898 } 2899 else static if (LDC_with_SSE2) 2900 { 2901 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2902 } 2903 else static if (LDC_with_ARM64) 2904 { 2905 // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02 2906 byte8 ra = vqmovun_s16(cast(short8)a); 2907 byte8 rb = vqmovun_s16(cast(short8)b); 2908 return cast(__m128i)vcombine_s8(ra, rb); 2909 } 2910 else 2911 { 2912 short8 sa = cast(short8)a; 2913 short8 sb = cast(short8)b; 2914 align(16) ubyte[16] result = void; 2915 for (int i = 0; i < 8; ++i) 2916 { 2917 short s = sa[i]; 2918 if (s < 0) s = 0; 2919 if (s > 255) s = 255; 2920 result[i] = cast(ubyte)s; 2921 2922 s = sb[i]; 2923 if (s < 0) s = 0; 2924 if (s > 255) s = 255; 2925 result[i+8] = cast(ubyte)s; 2926 } 2927 return *cast(__m128i*)(result.ptr); 2928 } 2929 } 2930 unittest 2931 { 2932 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 2933 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 2934 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 2935 0, 255, 0, 255, 255, 2, 1, 0]; 2936 foreach(i; 0..16) 2937 assert(AA.array[i] == cast(byte)(correctResult[i])); 2938 } 2939 2940 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 2941 /// and power consumption of spin-wait loops. 2942 void _mm_pause() @trusted 2943 { 2944 version(GNU) 2945 { 2946 static if (GDC_with_SSE2) 2947 { 2948 __builtin_ia32_pause(); 2949 } 2950 else version(X86) 2951 { 2952 asm pure nothrow @nogc @trusted 2953 { 2954 "pause;\n" : : : ; 2955 } 2956 } 2957 else 2958 static assert(false); 2959 } 2960 else static if (LDC_with_SSE2) 2961 { 2962 __builtin_ia32_pause(); 2963 } 2964 else static if (DMD_with_asm) 2965 { 2966 asm nothrow @nogc pure @safe 2967 { 2968 rep; nop; // F3 90 = pause 2969 } 2970 } 2971 else version (LDC) 2972 { 2973 // PERF: Do nothing currently , could be the "yield" intruction on ARM. 2974 } 2975 else 2976 static assert(false); 2977 } 2978 unittest 2979 { 2980 _mm_pause(); 2981 } 2982 2983 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 2984 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 2985 /// low 16 bits of 64-bit elements in result. 
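///
/// Concretely, result bits 15..0 hold `|a0-b0| + |a1-b1| + ... + |a7-b7|`, bits 79..64 hold the same
/// sum over bytes 8..15, and all other bits are zero.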
2986 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted 2987 { 2988 static if (GDC_with_SSE2) 2989 { 2990 return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b); 2991 } 2992 else static if (LDC_with_SSE2) 2993 { 2994 return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b); 2995 } 2996 else static if (LDC_with_ARM64) 2997 { 2998 ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b)); 2999 3000 // PERF: Looks suboptimal vs addp 3001 ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]); 3002 ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]); 3003 ushort8 r = 0; 3004 r[0] = r0; 3005 r[4] = r4; 3006 return cast(__m128i) r; 3007 } 3008 else 3009 { 3010 // PERF: ARM32 is lacking 3011 byte16 ab = cast(byte16)a; 3012 byte16 bb = cast(byte16)b; 3013 ubyte[16] t; 3014 foreach(i; 0..16) 3015 { 3016 int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]); 3017 if (diff < 0) diff = -diff; 3018 t[i] = cast(ubyte)(diff); 3019 } 3020 int4 r = _mm_setzero_si128(); 3021 r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 3022 r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; 3023 return r; 3024 } 3025 } 3026 unittest 3027 { 3028 __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 3029 __m128i B = _mm_set1_epi8(1); 3030 __m128i R = _mm_sad_epu8(A, B); 3031 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 3032 0, 3033 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 3034 0]; 3035 assert(R.array == correct); 3036 } 3037 3038 /// Set packed 16-bit integers with the supplied values. 3039 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 3040 { 3041 short8 r = void; 3042 r.ptr[0] = e0; 3043 r.ptr[1] = e1; 3044 r.ptr[2] = e2; 3045 r.ptr[3] = e3; 3046 r.ptr[4] = e4; 3047 r.ptr[5] = e5; 3048 r.ptr[6] = e6; 3049 r.ptr[7] = e7; 3050 return cast(__m128i) r; 3051 } 3052 unittest 3053 { 3054 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 3055 short8 B = cast(short8) A; 3056 foreach(i; 0..8) 3057 assert(B.array[i] == i); 3058 } 3059 3060 /// Set packed 32-bit integers with the supplied values. 3061 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3062 { 3063 align(16) int[4] r = [e0, e1, e2, e3]; 3064 return *cast(int4*)&r; 3065 } 3066 unittest 3067 { 3068 __m128i A = _mm_set_epi32(3, 2, 1, 0); 3069 foreach(i; 0..4) 3070 assert(A.array[i] == i); 3071 } 3072 3073 /// Set packed 64-bit integers with the supplied values. 3074 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted 3075 { 3076 pragma(inline, true); 3077 long2 r = void; 3078 r.ptr[0] = e0.array[0]; 3079 r.ptr[1] = e1.array[0]; 3080 return cast(__m128i)(r); 3081 } 3082 unittest 3083 { 3084 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); 3085 long2 B = cast(long2) A; 3086 assert(B.array[0] == 5678); 3087 assert(B.array[1] == 1234); 3088 } 3089 3090 /// Set packed 64-bit integers with the supplied values. 3091 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted 3092 { 3093 pragma(inline, true); 3094 long2 r = void; 3095 r.ptr[0] = e0; 3096 r.ptr[1] = e1; 3097 return cast(__m128i)(r); 3098 } 3099 unittest 3100 { 3101 __m128i A = _mm_set_epi64x(1234, -5678); 3102 long2 B = cast(long2) A; 3103 assert(B.array[0] == -5678); 3104 assert(B.array[1] == 1234); 3105 } 3106 3107 /// Set packed 8-bit integers with the supplied values. 
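/// As with the other `_mm_set_*` variants, arguments go from the most significant element (`e15`)
/// down to the least significant (`e0`). A minimal usage sketch (illustrative values):
/// ---
/// __m128i v = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// // byte 0 of v now holds 0, byte 15 holds 15
/// ---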
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    align(16) byte[16] result = [e0, e1,  e2,  e3,  e4,  e5,  e6,  e7,
                                 e8, e9, e10, e11, e12, e13, e14, e15];
    return *cast(__m128i*)(result.ptr);
}
// TODO unittest

/// Set packed double-precision (64-bit) floating-point elements with the supplied values.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    pragma(inline, true);
    double2 r = void;
    r.ptr[0] = e0;
    r.ptr[1] = e1;
    return r;
}
unittest
{
    __m128d A = _mm_set_pd(61.0, 55.0);
    double[2] correct = [55.0, 61.0];
    assert(A.array == correct);
}

/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    pragma(inline, true);
    __m128d r = void;
    r.ptr[0] = a;
    r.ptr[1] = a;
    return r;
}
unittest
{
    __m128d A = _mm_set_pd1(61.0);
    double[2] correct = [61.0, 61.0];
    assert(A.array == correct);
}

/// Copy double-precision (64-bit) floating-point element `a` to the lower element of result,
/// and zero the upper element.
__m128d _mm_set_sd (double a) pure @trusted
{
    double2 r = void;
    r.ptr[0] = a;
    r.ptr[1] = 0.0;
    return r;
}
unittest
{
    __m128d A = _mm_set_sd(61.0);
    double[2] correct = [61.0, 0.0];
    assert(A.array == correct);
}

/// Broadcast 16-bit integer `a` to all elements of dst.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        short8 v = a;
        return cast(__m128i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m128i)(short8(a));
    }
}
unittest
{
    short8 a = cast(short8) _mm_set1_epi16(31);
    for (int i = 0; i < 8; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 32-bit integer `a` to all elements.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    pragma(inline, true);
    return cast(__m128i)(int4(a));
}
unittest
{
    int4 a = cast(int4) _mm_set1_epi32(31);
    for (int i = 0; i < 4; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 64-bit integer `a` to all elements.
__m128i _mm_set1_epi64 (__m64 a) pure @safe
{
    return _mm_set_epi64(a, a);
}
unittest
{
    long b = 0x1DEADCAFE;
    __m64 a;
    a.ptr[0] = b;
    long2 c = cast(long2) _mm_set1_epi64(a);
    assert(c.array[0] == b);
    assert(c.array[1] == b);
}

/// Broadcast 64-bit integer `a` to all elements.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
    return cast(__m128i)(b);
}
unittest
{
    long b = 0x1DEADCAFE;
    long2 c = cast(long2) _mm_set1_epi64x(b);
    for (int i = 0; i < 2; ++i)
        assert(c.array[i] == b);
}

/// Broadcast 8-bit integer `a` to all elements.
3231 __m128i _mm_set1_epi8 (byte a) pure @trusted 3232 { 3233 pragma(inline, true); 3234 byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3235 return cast(__m128i)(b); 3236 } 3237 unittest 3238 { 3239 byte16 b = cast(byte16) _mm_set1_epi8(31); 3240 for (int i = 0; i < 16; ++i) 3241 assert(b.array[i] == 31); 3242 } 3243 3244 alias _mm_set1_pd = _mm_set_pd1; 3245 3246 /// Set packed 16-bit integers with the supplied values in reverse order. 3247 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 3248 short e3, short e2, short e1, short e0) pure @trusted 3249 { 3250 short8 r = void; 3251 r.ptr[0] = e7; 3252 r.ptr[1] = e6; 3253 r.ptr[2] = e5; 3254 r.ptr[3] = e4; 3255 r.ptr[4] = e3; 3256 r.ptr[5] = e2; 3257 r.ptr[6] = e1; 3258 r.ptr[7] = e0; 3259 return cast(__m128i)(r); 3260 } 3261 unittest 3262 { 3263 short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0); 3264 short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0]; 3265 assert(A.array == correct); 3266 } 3267 3268 /// Set packed 32-bit integers with the supplied values in reverse order. 3269 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3270 { 3271 // Performs better than = void; with GDC 3272 pragma(inline, true); 3273 align(16) int[4] result = [e3, e2, e1, e0]; 3274 return *cast(__m128i*)(result.ptr); 3275 } 3276 unittest 3277 { 3278 int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647); 3279 int[4] correct = [-1, 0, -2147483648, 2147483647]; 3280 assert(A.array == correct); 3281 } 3282 3283 /// Set packed 64-bit integers with the supplied values in reverse order. 3284 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted 3285 { 3286 long2 r = void; 3287 r.ptr[0] = e1; 3288 r.ptr[1] = e0; 3289 return cast(__m128i)(r); 3290 } 3291 unittest 3292 { 3293 long2 A = cast(long2) _mm_setr_epi64(-1, 0); 3294 long[2] correct = [-1, 0]; 3295 assert(A.array == correct); 3296 } 3297 3298 /// Set packed 8-bit integers with the supplied values in reverse order. 3299 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12, 3300 byte e11, byte e10, byte e9, byte e8, 3301 byte e7, byte e6, byte e5, byte e4, 3302 byte e3, byte e2, byte e1, byte e0) pure @trusted 3303 { 3304 align(16) byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, 3305 e7, e6, e5, e4, e3, e2, e1, e0]; 3306 return *cast(__m128i*)(result.ptr); 3307 } 3308 // TODO unittest 3309 3310 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order. 3311 __m128d _mm_setr_pd (double e1, double e0) pure @trusted 3312 { 3313 pragma(inline, true); 3314 double2 result; 3315 result.ptr[0] = e1; 3316 result.ptr[1] = e0; 3317 return result; 3318 } 3319 unittest 3320 { 3321 __m128d A = _mm_setr_pd(61.0, 55.0); 3322 double[2] correct = [61.0, 55.0]; 3323 assert(A.array == correct); 3324 } 3325 3326 /// Return vector of type `__m128d` with all elements set to zero. 3327 __m128d _mm_setzero_pd() pure @trusted 3328 { 3329 pragma(inline, true); 3330 double2 r = void; 3331 r.ptr[0] = 0.0; 3332 r.ptr[1] = 0.0; 3333 return r; 3334 } 3335 unittest 3336 { 3337 __m128d A = _mm_setzero_pd(); 3338 double[2] correct = [0.0, 0.0]; 3339 assert(A.array == correct); 3340 } 3341 3342 /// Return vector of type `__m128i` with all elements set to zero. 
__m128i _mm_setzero_si128() pure @trusted
{
    pragma(inline, true);
    int4 r = void;
    r.ptr[0] = 0;
    r.ptr[1] = 0;
    r.ptr[2] = 0;
    r.ptr[3] = 0;
    return r;
}
unittest
{
    __m128i A = _mm_setzero_si128();
    int[4] correct = [0, 0, 0, 0];
    assert(A.array == correct);
}

/// Shuffle 32-bit integers in `a` using the control in `imm8`.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufd(a, imm8);
    }
    else
    {
        return shufflevector!(int4, (imm8 >> 0) & 3,
                                    (imm8 >> 2) & 3,
                                    (imm8 >> 4) & 3,
                                    (imm8 >> 6) & 3)(a, a); // TODO remove this use of shufflevector except for LDC
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

/// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
/// See_also: `_MM_SHUFFLE2`.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_shufpd(a, b, imm8);
    }
    else
    {
        return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                       2 + ( (imm8 >> 1) & 1 ))(a, b); // TODO remove this use of shufflevector except for LDC
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}

/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                            4 + ( (imm8 >> 0) & 3 ),
                                            4 + ( (imm8 >> 2) & 3 ),
                                            4 + ( (imm8 >> 4) & 3 ),
                                            4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a); // TODO remove this use of shufflevector except for LDC
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}

/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64
/// bits of result, with the high 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
3439 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe 3440 { 3441 static if (GDC_with_SSE2) 3442 { 3443 return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8); 3444 } 3445 else 3446 { 3447 // TODO remove this use of shufflevector except for LDC 3448 return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ), 3449 ( (imm8 >> 2) & 3 ), 3450 ( (imm8 >> 4) & 3 ), 3451 ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a); 3452 } 3453 } 3454 unittest 3455 { 3456 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3457 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3458 short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A); 3459 short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ]; 3460 assert(B.array == expectedB); 3461 } 3462 3463 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros. 3464 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted 3465 { 3466 static if (LDC_with_SSE2) 3467 { 3468 return __builtin_ia32_pslld128(a, count); 3469 } 3470 else static if (GDC_with_SSE2) 3471 { 3472 return __builtin_ia32_pslld128(a, count); 3473 } 3474 else static if (DMD_with_32bit_asm) 3475 { 3476 asm pure nothrow @nogc @trusted 3477 { 3478 movdqu XMM0, a; 3479 movdqu XMM1, count; 3480 pslld XMM0, XMM1; 3481 movdqu a, XMM0; 3482 } 3483 return a; 3484 } 3485 else 3486 { 3487 int4 r = void; 3488 long2 lc = cast(long2)count; 3489 int bits = cast(int)(lc.array[0]); 3490 foreach(i; 0..4) 3491 r[i] = cast(uint)(a[i]) << bits; 3492 return r; 3493 } 3494 } 3495 3496 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros. 3497 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted 3498 { 3499 static if (LDC_with_SSE2) 3500 { 3501 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); 3502 } 3503 else static if (GDC_with_SSE2) 3504 { 3505 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); 3506 } 3507 else static if (DMD_with_32bit_asm) 3508 { 3509 asm pure nothrow @nogc @trusted 3510 { 3511 movdqu XMM0, a; 3512 movdqu XMM1, count; 3513 psllq XMM0, XMM1; 3514 movdqu a, XMM0; 3515 } 3516 return a; 3517 } 3518 else 3519 { 3520 // ARM: good since LDC 1.12 -O2 3521 // ~but -O0 version is catastrophic 3522 long2 r = void; 3523 long2 sa = cast(long2)a; 3524 long2 lc = cast(long2)count; 3525 int bits = cast(int)(lc.array[0]); 3526 foreach(i; 0..2) 3527 r.array[i] = cast(ulong)(sa.array[i]) << bits; 3528 return cast(__m128i)r; 3529 } 3530 } 3531 3532 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros. 
deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
        return cast(int4)r;
    }
}


/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        // D says "It's illegal to shift by the same or more bits
        // than the size of the quantity being shifted"
        // and it's UB instead.
        int4 r = _mm_setzero_si128();

        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            return r;

        foreach(i; 0..4)
            r.array[i] = cast(uint)(a.array[i]) << count;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    __m128i B2 = _mm_slli_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    __m128i C = _mm_slli_epi32(A, 0);
    int[4] expectedC = [ 0, 2, 3, -4];
    assert(C.array == expectedC);

    __m128i D = _mm_slli_epi32(A, 65);
    int[4] expectedD = [ 0, 0, 0, 0];
    assert(D.array == expectedD);
}

/// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
__m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else
    {
        long2 sa = cast(long2)a;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        // D says "It's illegal to shift by the same or more bits
        // than the size of the quantity being shifted"
        // and it's UB instead.
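        // Starting from a zeroed vector lets out-of-range counts (> 63) simply return zero,
        // which matches what the hardware instruction does.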
3632 long2 r = cast(long2) _mm_setzero_si128(); 3633 ubyte count = cast(ubyte) imm8; 3634 if (count > 63) 3635 return cast(__m128i)r; 3636 3637 r.ptr[0] = cast(ulong)(sa.array[0]) << count; 3638 r.ptr[1] = cast(ulong)(sa.array[1]) << count; 3639 return cast(__m128i)r; 3640 } 3641 } 3642 unittest 3643 { 3644 __m128i A = _mm_setr_epi64(8, -4); 3645 long2 B = cast(long2) _mm_slli_epi64(A, 1); 3646 long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024); 3647 long[2] expectedB = [ 16, -8]; 3648 assert(B.array == expectedB); 3649 assert(B2.array == expectedB); 3650 3651 long2 C = cast(long2) _mm_slli_epi64(A, 0); 3652 long[2] expectedC = [ 8, -4]; 3653 assert(C.array == expectedC); 3654 3655 long2 D = cast(long2) _mm_slli_epi64(A, 64); 3656 long[2] expectedD = [ 0, -0]; 3657 assert(D.array == expectedD); 3658 } 3659 3660 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 3661 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted 3662 { 3663 static if (GDC_with_SSE2) 3664 { 3665 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3666 } 3667 else static if (LDC_with_SSE2) 3668 { 3669 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3670 } 3671 else static if (LDC_with_ARM64) 3672 { 3673 short8 sa = cast(short8)a; 3674 short8 r = cast(short8)_mm_setzero_si128(); 3675 ubyte count = cast(ubyte) imm8; 3676 if (count > 15) 3677 return cast(__m128i)r; 3678 r = sa << short8(count); 3679 return cast(__m128i)r; 3680 } 3681 else 3682 { 3683 short8 sa = cast(short8)a; 3684 short8 r = cast(short8)_mm_setzero_si128(); 3685 ubyte count = cast(ubyte) imm8; 3686 if (count > 15) 3687 return cast(__m128i)r; 3688 foreach(i; 0..8) 3689 r.ptr[i] = cast(short)(sa.array[i] << count); 3690 return cast(__m128i)r; 3691 } 3692 } 3693 unittest 3694 { 3695 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3696 short8 B = cast(short8)( _mm_slli_epi16(A, 1) ); 3697 short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) ); 3698 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 3699 assert(B.array == expectedB); 3700 assert(B2.array == expectedB); 3701 3702 short8 C = cast(short8)( _mm_slli_epi16(A, 16) ); 3703 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 3704 assert(C.array == expectedC); 3705 } 3706 3707 3708 /// Shift `a` left by `bytes` bytes while shifting in zeros. 
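/// Shift counts of 16 or more produce a zero vector.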
3709 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted 3710 { 3711 static if (bytes & 0xF0) 3712 { 3713 return _mm_setzero_si128(); 3714 } 3715 else 3716 { 3717 static if (GDC_with_SSE2) 3718 { 3719 return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 3720 } 3721 else version(DigitalMars) 3722 { 3723 version(D_InlineAsm_X86) 3724 { 3725 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64 3726 { 3727 movdqu XMM0, op; 3728 pslldq XMM0, bytes; 3729 movdqu op, XMM0; 3730 } 3731 return op; 3732 } 3733 else 3734 { 3735 byte16 A = cast(byte16)op; 3736 byte16 R; 3737 for (int n = 15; n >= bytes; --n) 3738 R.ptr[n] = A.array[n-bytes]; 3739 for (int n = bytes-1; n >= 0; --n) 3740 R.ptr[n] = 0; 3741 return cast(__m128i)R; 3742 } 3743 } 3744 else 3745 { 3746 // TODO remove this use of shufflevector except for LDC 3747 return cast(__m128i) shufflevector!(byte16, 3748 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes, 3749 22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes, 3750 28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes) 3751 (cast(byte16)_mm_setzero_si128(), cast(byte16)op); 3752 } 3753 } 3754 } 3755 unittest 3756 { 3757 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3758 short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left 3759 short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ]; 3760 assert(R.array == correct); 3761 3762 __m128i B = _mm_srli_si128!16(_mm_set1_epi32(-1)); 3763 int[4] expectedB = [0, 0, 0, 0]; 3764 assert(B.array == expectedB); 3765 } 3766 3767 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`. 3768 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted 3769 { 3770 version(LDC) 3771 { 3772 // Disappeared with LDC 1.11 3773 static if (__VERSION__ < 2081) 3774 return __builtin_ia32_sqrtpd(vec); 3775 else 3776 { 3777 vec.array[0] = llvm_sqrt(vec.array[0]); 3778 vec.array[1] = llvm_sqrt(vec.array[1]); 3779 return vec; 3780 } 3781 } 3782 else static if (GDC_with_SSE2) 3783 { 3784 return __builtin_ia32_sqrtpd(vec); 3785 } 3786 else 3787 { 3788 vec.ptr[0] = sqrt(vec.array[0]); 3789 vec.ptr[1] = sqrt(vec.array[1]); 3790 return vec; 3791 } 3792 } 3793 3794 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 3795 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 3796 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted 3797 { 3798 // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only. 3799 // "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 3800 // The quadword at bits 127:64 of the destination operand remains unchanged." 
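    // Because of that, only `b` feeds the square root here; every code path below
    // keeps the upper lane of the result equal to the upper lane of `a`.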
3801 version(LDC) 3802 { 3803 // Disappeared with LDC 1.11 3804 static if (__VERSION__ < 2081) 3805 { 3806 __m128d c = __builtin_ia32_sqrtsd(b); 3807 a[0] = c[0]; 3808 return a; 3809 } 3810 else 3811 { 3812 a.array[0] = llvm_sqrt(b.array[0]); 3813 return a; 3814 } 3815 } 3816 else static if (GDC_with_SSE2) 3817 { 3818 __m128d c = __builtin_ia32_sqrtsd(b); 3819 a.ptr[0] = c.array[0]; 3820 return a; 3821 } 3822 else 3823 { 3824 a.ptr[0] = sqrt(b.array[0]); 3825 return a; 3826 } 3827 } 3828 unittest 3829 { 3830 __m128d A = _mm_setr_pd(1.0, 3.0); 3831 __m128d B = _mm_setr_pd(4.0, 5.0); 3832 __m128d R = _mm_sqrt_sd(A, B); 3833 double[2] correct = [2.0, 3.0 ]; 3834 assert(R.array == correct); 3835 } 3836 3837 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits. 3838 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted 3839 { 3840 static if (GDC_with_SSE2) 3841 { 3842 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3843 } 3844 else static if (LDC_with_SSE2) 3845 { 3846 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3847 } 3848 else 3849 { 3850 short8 sa = cast(short8)a; 3851 long2 lc = cast(long2)count; 3852 int bits = cast(int)(lc.array[0]); 3853 short8 r = void; 3854 foreach(i; 0..8) 3855 r.ptr[i] = cast(short)(sa.array[i] >> bits); 3856 return cast(int4)r; 3857 } 3858 } 3859 3860 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits. 3861 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted 3862 { 3863 static if (LDC_with_SSE2) 3864 { 3865 return __builtin_ia32_psrad128(a, count); 3866 } 3867 else static if (GDC_with_SSE2) 3868 { 3869 return __builtin_ia32_psrad128(a, count); 3870 } 3871 else 3872 { 3873 int4 r = void; 3874 long2 lc = cast(long2)count; 3875 int bits = cast(int)(lc.array[0]); 3876 r.ptr[0] = (a.array[0] >> bits); 3877 r.ptr[1] = (a.array[1] >> bits); 3878 r.ptr[2] = (a.array[2] >> bits); 3879 r.ptr[3] = (a.array[3] >> bits); 3880 return r; 3881 } 3882 } 3883 3884 3885 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 3886 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 3887 { 3888 static if (GDC_with_SSE2) 3889 { 3890 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3891 } 3892 else static if (LDC_with_SSE2) 3893 { 3894 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3895 } 3896 else static if (LDC_with_ARM64) 3897 { 3898 short8 sa = cast(short8)a; 3899 ubyte count = cast(ubyte)imm8; 3900 if (count > 15) 3901 count = 15; 3902 short8 r = sa >> short8(count); 3903 return cast(__m128i)r; 3904 } 3905 else 3906 { 3907 short8 sa = cast(short8)a; 3908 short8 r = void; 3909 3910 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3911 // D says "It's illegal to shift by the same or more bits 3912 // than the size of the quantity being shifted" 3913 // and it's UB instead. 
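        // Arithmetic right shifts saturate the count: shifting by 16 or more
        // behaves like shifting by 15, leaving only copies of the sign bit.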
        ubyte count = cast(ubyte)imm8;
        if (count > 15)
            count = 15;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(sa.array[i] >> count);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
    short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
    short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
    assert(C.array == expectedC);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
    }
    else
    {
        int4 r = void;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        //       D says "It's illegal to shift by the same or more bits
        //       than the size of the quantity being shifted"
        //       and it's UB instead.
        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            count = 31;

        r.ptr[0] = (a.array[0] >> count);
        r.ptr[1] = (a.array[1] >> count);
        r.ptr[2] = (a.array[2] >> count);
        r.ptr[3] = (a.array[3] >> count);
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srai_epi32(A, 1);
    __m128i B2 = _mm_srai_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    __m128i C = _mm_srai_epi32(A, 32);
    int[4] expectedC = [ 0, 0, 0, -1];
    assert(C.array == expectedC);

    __m128i D = _mm_srai_epi32(A, 0);
    int[4] expectedD = [ 0, 2, 3, -4];
    assert(D.array == expectedD);
}

/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
        return cast(int4)r;
    }
}

/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_psrld128(a, count);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_psrld128(a, count);
    }
    else
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        r.ptr[0] = cast(uint)(a.array[0]) >> bits;
        r.ptr[1] = cast(uint)(a.array[1]) >> bits;
        r.ptr[2] = cast(uint)(a.array[2]) >> bits;
        r.ptr[3] = cast(uint)(a.array[3]) >> bits;
        return r;
    }
}

/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return
cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 4034 } 4035 else static if (GDC_with_SSE2) 4036 { 4037 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 4038 } 4039 else 4040 { 4041 // Workaround for https://issues.dlang.org/show_bug.cgi?id=23047 4042 // => avoid void initialization. 4043 long2 r; 4044 long2 sa = cast(long2)a; 4045 long2 lc = cast(long2)count; 4046 int bits = cast(int)(lc.array[0]); 4047 r.ptr[0] = cast(ulong)(sa.array[0]) >> bits; 4048 r.ptr[1] = cast(ulong)(sa.array[1]) >> bits; 4049 return cast(__m128i)r; 4050 } 4051 } 4052 4053 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 4054 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted 4055 { 4056 static if (GDC_with_SSE2) 4057 { 4058 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 4059 } 4060 else static if (LDC_with_SSE2) 4061 { 4062 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 4063 } 4064 else static if (LDC_with_ARM64) 4065 { 4066 short8 sa = cast(short8)a; 4067 short8 r = cast(short8) _mm_setzero_si128(); 4068 4069 ubyte count = cast(ubyte)imm8; 4070 if (count >= 16) 4071 return cast(__m128i)r; 4072 4073 r = sa >>> short8(count); // This facility offered with LDC, but not DMD. 4074 return cast(__m128i)r; 4075 } 4076 else 4077 { 4078 short8 sa = cast(short8)a; 4079 ubyte count = cast(ubyte)imm8; 4080 4081 short8 r = cast(short8) _mm_setzero_si128(); 4082 if (count >= 16) 4083 return cast(__m128i)r; 4084 4085 foreach(i; 0..8) 4086 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count); 4087 return cast(__m128i)r; 4088 } 4089 } 4090 unittest 4091 { 4092 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 4093 short8 B = cast(short8)( _mm_srli_epi16(A, 1) ); 4094 short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) ); 4095 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 4096 assert(B.array == expectedB); 4097 assert(B2.array == expectedB); 4098 4099 short8 C = cast(short8)( _mm_srli_epi16(A, 16) ); 4100 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0]; 4101 assert(C.array == expectedC); 4102 4103 short8 D = cast(short8)( _mm_srli_epi16(A, 0) ); 4104 short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ]; 4105 assert(D.array == expectedD); 4106 } 4107 4108 4109 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 4110 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted 4111 { 4112 static if (GDC_with_SSE2) 4113 { 4114 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4115 } 4116 else static if (LDC_with_SSE2) 4117 { 4118 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4119 } 4120 else 4121 { 4122 ubyte count = cast(ubyte) imm8; 4123 4124 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4125 // D says "It's illegal to shift by the same or more bits 4126 // than the size of the quantity being shifted" 4127 // and it's UB instead. 
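        // Unlike the arithmetic shifts, a logical shift by 32 or more simply
        // produces zero, hence the early return below.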
4128 int4 r = _mm_setzero_si128(); 4129 if (count >= 32) 4130 return r; 4131 r.ptr[0] = a.array[0] >>> count; 4132 r.ptr[1] = a.array[1] >>> count; 4133 r.ptr[2] = a.array[2] >>> count; 4134 r.ptr[3] = a.array[3] >>> count; 4135 return r; 4136 } 4137 } 4138 unittest 4139 { 4140 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 4141 __m128i B = _mm_srli_epi32(A, 1); 4142 __m128i B2 = _mm_srli_epi32(A, 1 + 256); 4143 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; 4144 assert(B.array == expectedB); 4145 assert(B2.array == expectedB); 4146 4147 __m128i C = _mm_srli_epi32(A, 255); 4148 int[4] expectedC = [ 0, 0, 0, 0 ]; 4149 assert(C.array == expectedC); 4150 } 4151 4152 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 4153 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted 4154 { 4155 static if (GDC_with_SSE2) 4156 { 4157 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4158 } 4159 else static if (LDC_with_SSE2) 4160 { 4161 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4162 } 4163 else 4164 { 4165 long2 r = cast(long2) _mm_setzero_si128(); 4166 long2 sa = cast(long2)a; 4167 4168 ubyte count = cast(ubyte) imm8; 4169 if (count >= 64) 4170 return cast(__m128i)r; 4171 4172 r.ptr[0] = sa.array[0] >>> count; 4173 r.ptr[1] = sa.array[1] >>> count; 4174 return cast(__m128i)r; 4175 } 4176 } 4177 unittest 4178 { 4179 __m128i A = _mm_setr_epi64(8, -4); 4180 long2 B = cast(long2) _mm_srli_epi64(A, 1); 4181 long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512); 4182 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; 4183 assert(B.array == expectedB); 4184 assert(B2.array == expectedB); 4185 4186 long2 C = cast(long2) _mm_srli_epi64(A, 64); 4187 long[2] expectedC = [ 0, 0 ]; 4188 assert(C.array == expectedC); 4189 } 4190 4191 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4192 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe 4193 { 4194 static if (bytes & 0xF0) 4195 { 4196 return _mm_setzero_si128(); 4197 } 4198 else static if (GDC_with_SSE2) 4199 { 4200 return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8)); 4201 } 4202 else static if (DMD_with_32bit_asm) 4203 { 4204 asm pure nothrow @nogc @trusted 4205 { 4206 movdqu XMM0, v; 4207 psrldq XMM0, bytes; 4208 movdqu v, XMM0; 4209 } 4210 return v; 4211 } 4212 else 4213 { 4214 // TODO remove this use of shufflevector except for LDC 4215 return cast(__m128i) shufflevector!(byte16, 4216 bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7, 4217 bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15) 4218 (cast(byte16) v, cast(byte16)_mm_setzero_si128()); 4219 } 4220 } 4221 unittest 4222 { 4223 __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1)); 4224 int[4] correct = [2, 3, 4, 0]; 4225 assert(R.array == correct); 4226 4227 __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1)); 4228 int[4] expectedA = [0, 0, 0, 0]; 4229 assert(A.array == expectedA); 4230 } 4231 4232 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4233 /// #BONUS 4234 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe 4235 { 4236 return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v); 4237 } 4238 unittest 4239 { 4240 __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)); 4241 float[4] correct = [3.0f, 4.0f, 0, 0]; 4242 assert(R.array == correct); 4243 } 4244 4245 /// Shift `v` right by `bytes` bytes while shifting in zeros. 
4246 /// #BONUS 4247 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe 4248 { 4249 return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v); 4250 } 4251 4252 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 4253 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 4254 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted 4255 { 4256 pragma(inline, true); 4257 __m128d* aligned = cast(__m128d*)mem_addr; 4258 *aligned = a; 4259 } 4260 4261 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 4262 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 4263 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted 4264 { 4265 __m128d* aligned = cast(__m128d*)mem_addr; 4266 __m128d r; // PERF =void; 4267 r.ptr[0] = a.array[0]; 4268 r.ptr[1] = a.array[0]; 4269 *aligned = r; 4270 } 4271 4272 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 4273 /// be aligned on any particular boundary. 4274 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe 4275 { 4276 pragma(inline, true); 4277 *mem_addr = a.array[0]; 4278 } 4279 4280 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 4281 /// general-protection exception may be generated. 4282 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe 4283 { 4284 pragma(inline, true); 4285 *mem_addr = a; 4286 } 4287 4288 alias _mm_store1_pd = _mm_store_pd1; /// 4289 4290 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory. 4291 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe 4292 { 4293 pragma(inline, true); 4294 *mem_addr = a.array[1]; 4295 } 4296 4297 // Note: `mem_addr` doesn't have to actually be aligned, which breaks 4298 // expectations from the user point of view. This problem also exist in C++. 4299 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe 4300 { 4301 pragma(inline, true); 4302 long* dest = cast(long*)mem_addr; 4303 long2 la = cast(long2)a; 4304 *dest = la.array[0]; 4305 } 4306 unittest 4307 { 4308 long[3] A = [1, 2, 3]; 4309 _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000)); 4310 long[3] correct = [1, 0x1_0000_0000, 3]; 4311 assert(A == correct); 4312 } 4313 4314 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. 4315 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe 4316 { 4317 pragma(inline, true); 4318 *mem_addr = a.array[0]; 4319 } 4320 4321 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be 4322 /// aligned on a 16-byte boundary or a general-protection exception may be generated. 4323 void _mm_storer_pd (double* mem_addr, __m128d a) pure @system 4324 { 4325 // TODO remove this use of shufflevector except for LDC 4326 __m128d* aligned = cast(__m128d*)mem_addr; 4327 *aligned = shufflevector!(double2, 1, 0)(a, a); 4328 } 4329 4330 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 4331 /// `mem_addr` does not need to be aligned on any particular boundary. 
4332 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @trusted // TODO: signature, should be system 4333 { 4334 // PERF DMD 4335 pragma(inline, true); 4336 static if (GDC_with_SSE2) 4337 { 4338 __builtin_ia32_storeupd(mem_addr, a); 4339 } 4340 else version(LDC) 4341 { 4342 storeUnaligned!double2(a, mem_addr); 4343 } 4344 else 4345 { 4346 mem_addr[0] = a.array[0]; 4347 mem_addr[1] = a.array[1]; 4348 } 4349 } 4350 unittest 4351 { 4352 __m128d A = _mm_setr_pd(3.0, 4.0); 4353 align(16) double[4] R = [0.0, 0, 0, 0]; 4354 double[2] correct = [3.0, 4.0]; 4355 _mm_storeu_pd(&R[1], A); 4356 assert(R[1..3] == correct); 4357 } 4358 4359 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 4360 /// boundary. 4361 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @trusted // TODO: signature is wrong, mem_addr is not aligned 4362 { 4363 // PERF: DMD 4364 pragma(inline, true); 4365 static if (GDC_with_SSE2) 4366 { 4367 __builtin_ia32_storedqu(cast(char*)mem_addr, cast(ubyte16)a); 4368 } 4369 else version(LDC) 4370 { 4371 storeUnaligned!__m128i(a, cast(int*)mem_addr); 4372 } 4373 else 4374 { 4375 int* p = cast(int*)mem_addr; 4376 p[0] = a.array[0]; 4377 p[1] = a.array[1]; 4378 p[2] = a.array[2]; 4379 p[3] = a.array[3]; 4380 } 4381 } 4382 unittest 4383 { 4384 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4385 align(16) int[6] R = [0, 0, 0, 0, 0, 0]; 4386 int[4] correct = [1, 2, 3, 4]; 4387 _mm_storeu_si128(cast(__m128i*)(&R[1]), A); 4388 assert(R[1..5] == correct); 4389 } 4390 4391 /// Store 32-bit integer from the first element of `a` into memory. 4392 /// `mem_addr` does not need to be aligned on any particular boundary. 4393 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted 4394 { 4395 pragma(inline, true); 4396 int* dest = cast(int*)mem_addr; 4397 *dest = a.array[0]; 4398 } 4399 unittest 4400 { 4401 int[2] arr = [-24, 12]; 4402 _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7)); 4403 assert(arr == [-24, -1]); 4404 } 4405 4406 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) 4407 /// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte 4408 /// boundary or a general-protection exception may be generated. 4409 void _mm_stream_pd (double* mem_addr, __m128d a) 4410 { 4411 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4412 __m128d* dest = cast(__m128d*)mem_addr; 4413 *dest = a; 4414 } 4415 4416 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint. 4417 /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception 4418 /// may be generated. 4419 void _mm_stream_si128 (__m128i* mem_addr, __m128i a) 4420 { 4421 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4422 __m128i* dest = cast(__m128i*)mem_addr; 4423 *dest = a; 4424 } 4425 4426 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache 4427 /// pollution. If the cache line containing address mem_addr is already in the cache, 4428 /// the cache will be updated. 4429 void _mm_stream_si32 (int* mem_addr, int a) 4430 { 4431 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4432 *mem_addr = a; 4433 } 4434 4435 /// Store 64-bit integer a into memory using a non-temporal hint to minimize 4436 /// cache pollution. 
/// If the cache line containing address mem_addr is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}
// TODO unittest

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}
// TODO unittest

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}
// TODO unittest

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}
// TODO unittest

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit)
/// floating-point elements in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a - b;
}
// TODO unittest

/// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit)
/// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
/// upper element of result.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_subsd(a, b);
    }
    else
    {
        a.ptr[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}

/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    pragma(inline, true);
    return a - b;
}
// TODO unittest

/// Subtract packed signed 16-bit integers in `b` from packed 16-bit integers in `a` using saturation.
__m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PSUBSW since LDC 1.15 -O0
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res; // PERF: =void;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
    }
    else
    {
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(  -10,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed signed 8-bit integers in `b` from packed 8-bit integers in `a` using saturation.
__m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBSB since LDC 1.15 -O0
            // ARM: Generates sqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            byte[16] res; // PERF =void;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        byte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0,
                                                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a` using saturation.
__m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSW since LDC 1.15 -O0
            // ARM: Generates uqsub.8h since LDC 1.21 -O0
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res; // PERF =void;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(sum);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            res[i] = saturateSignedIntToUnsignedShort(sum);
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534, 1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a` using saturation.
__m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSB since LDC 1.15 -O0
            // ARM: Generates uqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ubyte[16] res; // PERF =void;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubusb128(cast(byte16)a, cast(byte16)b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        ubyte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Note: the only difference between these intrinsics is the signalling
//       behaviour of quiet NaNs. This is incorrect but the case where
//       you would want to differentiate between qNaN and sNaN and then
//       treat them differently on purpose seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd; ///
alias _mm_ucomige_sd = _mm_comige_sd; ///
alias _mm_ucomigt_sd = _mm_comigt_sd; ///
alias _mm_ucomile_sd = _mm_comile_sd; ///
alias _mm_ucomilt_sd = _mm_comilt_sd; ///
alias _mm_ucomineq_sd = _mm_comineq_sd; ///

/// Return vector of type `__m128d` with undefined elements.
__m128d _mm_undefined_pd() pure @safe
{
    pragma(inline, true);
    __m128d result = void;
    return result;
}

/// Return vector of type `__m128i` with undefined elements.
__m128i _mm_undefined_si128() pure @safe
{
    pragma(inline, true);
    __m128i result = void;
    return result;
}

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // TODO remove this use of shufflevector except for LDC
        return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
4809 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted 4810 { 4811 static if (GDC_with_SSE2) 4812 { 4813 return __builtin_ia32_punpckhdq128(a, b); 4814 } 4815 else version(DigitalMars) 4816 { 4817 __m128i r; 4818 r.ptr[0] = a.array[2]; 4819 r.ptr[1] = b.array[2]; 4820 r.ptr[2] = a.array[3]; 4821 r.ptr[3] = b.array[3]; 4822 return r; 4823 } 4824 else 4825 { 4826 // TODO remove this use of shufflevector except for LDC 4827 return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b); 4828 } 4829 } 4830 unittest 4831 { 4832 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4833 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 4834 __m128i C = _mm_unpackhi_epi32(A, B); 4835 int[4] correct = [3, 7, 4, 8]; 4836 assert(C.array == correct); 4837 } 4838 4839 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. 4840 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted 4841 { 4842 static if (GDC_with_SSE2) 4843 { 4844 return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b); 4845 } 4846 else 4847 { 4848 __m128i r = cast(__m128i)b; 4849 r[0] = a[2]; 4850 r[1] = a[3]; 4851 return r; 4852 } 4853 } 4854 unittest // Issue #36 4855 { 4856 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4857 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4858 long2 C = cast(long2)(_mm_unpackhi_epi64(A, B)); 4859 long[2] correct = [0x33333333_33333333, 0x55555555_55555555]; 4860 assert(C.array == correct); 4861 } 4862 4863 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. 4864 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe 4865 { 4866 static if (GDC_with_SSE2) 4867 { 4868 return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b); 4869 } 4870 else static if (DMD_with_32bit_asm) 4871 { 4872 asm pure nothrow @nogc @trusted 4873 { 4874 movdqu XMM0, a; 4875 movdqu XMM1, b; 4876 punpckhbw XMM0, XMM1; 4877 movdqu a, XMM0; 4878 } 4879 return a; 4880 } 4881 else 4882 { 4883 // TODO remove this use of shufflevector except for LDC 4884 return cast(__m128i)shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27, 4885 12, 28, 13, 29, 14, 30, 15, 31) 4886 (cast(byte16)a, cast(byte16)b); 4887 } 4888 } 4889 unittest 4890 { 4891 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 4892 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 4893 byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B); 4894 byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]; 4895 assert(C.array == correct); 4896 } 4897 4898 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`. 4899 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe 4900 { 4901 static if (GDC_with_SSE2) 4902 { 4903 return __builtin_ia32_unpckhpd(a, b); 4904 } 4905 else 4906 { 4907 return shufflevector!(__m128d, 1, 3)(a, b); // TODO remove this use of shufflevector except for LDC 4908 } 4909 } 4910 unittest 4911 { 4912 __m128d A = _mm_setr_pd(4.0, 6.0); 4913 __m128d B = _mm_setr_pd(7.0, 9.0); 4914 __m128d C = _mm_unpackhi_pd(A, B); 4915 double[2] correct = [6.0, 9.0]; 4916 assert(C.array == correct); 4917 } 4918 4919 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. 
4920 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe 4921 { 4922 static if (GDC_with_SSE2) 4923 { 4924 return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b); 4925 } 4926 else static if (DMD_with_32bit_asm) 4927 { 4928 asm pure nothrow @nogc @trusted 4929 { 4930 movdqu XMM0, a; 4931 movdqu XMM1, b; 4932 punpcklwd XMM0, XMM1; 4933 movdqu a, XMM0; 4934 } 4935 return a; 4936 } 4937 else 4938 { 4939 return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11) 4940 (cast(short8)a, cast(short8)b); // TODO remove this use of shufflevector except for LDC 4941 } 4942 } 4943 unittest 4944 { 4945 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 4946 __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); 4947 short8 C = cast(short8) _mm_unpacklo_epi16(A, B); 4948 short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11]; 4949 assert(C.array == correct); 4950 } 4951 4952 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. 4953 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted 4954 { 4955 static if (GDC_with_SSE2) 4956 { 4957 return __builtin_ia32_punpckldq128(a, b); 4958 } 4959 else version(DigitalMars) 4960 { 4961 __m128i r; 4962 r.ptr[0] = a.array[0]; 4963 r.ptr[1] = b.array[0]; 4964 r.ptr[2] = a.array[1]; 4965 r.ptr[3] = b.array[1]; 4966 return r; 4967 } 4968 else 4969 { 4970 return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b); // TODO remove this use of shufflevector except for LDC 4971 } 4972 } 4973 unittest 4974 { 4975 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4976 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 4977 __m128i C = _mm_unpacklo_epi32(A, B); 4978 int[4] correct = [1, 5, 2, 6]; 4979 assert(C.array == correct); 4980 } 4981 4982 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. 4983 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted 4984 { 4985 static if (GDC_with_SSE2) 4986 { 4987 return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b); 4988 } 4989 else 4990 { 4991 long2 lA = cast(long2)a; 4992 long2 lB = cast(long2)b; 4993 long2 R; // PERF =void; 4994 R.ptr[0] = lA.array[0]; 4995 R.ptr[1] = lB.array[0]; 4996 return cast(__m128i)R; 4997 } 4998 } 4999 unittest // Issue #36 5000 { 5001 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 5002 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 5003 long2 C = cast(long2)(_mm_unpacklo_epi64(A, B)); 5004 long[2] correct = [0x22222222_22222222, 0x44444444_44444444]; 5005 assert(C.array == correct); 5006 } 5007 5008 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. 
5009 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe 5010 { 5011 static if (GDC_with_SSE2) 5012 { 5013 return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b); 5014 } 5015 else static if (DMD_with_32bit_asm) 5016 { 5017 asm pure nothrow @nogc @trusted 5018 { 5019 movdqu XMM0, a; 5020 movdqu XMM1, b; 5021 punpcklbw XMM0, XMM1; 5022 movdqu a, XMM0; 5023 } 5024 return a; 5025 } 5026 else 5027 { 5028 return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19, 5029 4, 20, 5, 21, 6, 22, 7, 23) 5030 (cast(byte16)a, cast(byte16)b); // TODO remove this use of shufflevector except for LDC 5031 } 5032 } 5033 unittest 5034 { 5035 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 5036 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 5037 byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B); 5038 byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]; 5039 assert(C.array == correct); 5040 } 5041 5042 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`. 5043 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe 5044 { 5045 static if (GDC_with_SSE2) 5046 { 5047 return __builtin_ia32_unpcklpd(a, b); 5048 } 5049 else 5050 { 5051 return shufflevector!(__m128d, 0, 2)(a, b); // TODO remove this use of shufflevector except for LDC 5052 } 5053 } 5054 unittest 5055 { 5056 __m128d A = _mm_setr_pd(4.0, 6.0); 5057 __m128d B = _mm_setr_pd(7.0, 9.0); 5058 __m128d C = _mm_unpacklo_pd(A, B); 5059 double[2] correct = [4.0, 7.0]; 5060 assert(C.array == correct); 5061 } 5062 5063 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 5064 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe 5065 { 5066 return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b); 5067 } 5068 // TODO unittest 5069 5070 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`. 5071 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe 5072 { 5073 return a ^ b; 5074 } 5075 // TODO unittest 5076 5077 unittest 5078 { 5079 float distance(float[4] a, float[4] b) nothrow @nogc 5080 { 5081 __m128 va = _mm_loadu_ps(a.ptr); 5082 __m128 vb = _mm_loadu_ps(b.ptr); 5083 __m128 diffSquared = _mm_sub_ps(va, vb); 5084 diffSquared = _mm_mul_ps(diffSquared, diffSquared); 5085 __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared)); 5086 sum = _mm_add_ps(sum, _mm_srli_ps!4(sum)); 5087 return _mm_cvtss_f32(_mm_sqrt_ss(sum)); 5088 } 5089 assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2); 5090 }
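// Several of the subtraction and XOR intrinsics above are still marked "TODO unittest".
// Below is a minimal sketch of such coverage; the values are illustrative only and
// not taken from an existing test suite.
unittest
{
    __m128i A = _mm_setr_epi32(10, -20, 30, -40);
    __m128i B = _mm_setr_epi32( 1,   2,  3,   4);

    // _mm_sub_epi32: element-wise 32-bit subtraction.
    __m128i S = _mm_sub_epi32(A, B);
    int[4] subCorrect = [9, -22, 27, -44];
    assert(S.array == subCorrect);

    // _mm_xor_si128: anything XORed with itself is zero.
    __m128i X = _mm_xor_si128(A, A);
    int[4] zero = [0, 0, 0, 0];
    assert(X.array == zero);
}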