1 /** 2 * SSE2 intrinsics. 3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2 4 * 5 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019. 6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 7 */ 8 module inteli.emmintrin; 9 10 public import inteli.types; 11 public import inteli.xmmintrin; // SSE2 includes SSE1 12 import inteli.mmx; 13 import inteli.internals; 14 15 nothrow @nogc: 16 17 18 // SSE2 instructions 19 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2 20 21 /// Add packed 16-bit integers in `a` and `b`. 22 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe 23 { 24 pragma(inline, true); 25 return cast(__m128i)(cast(short8)a + cast(short8)b); 26 } 27 unittest 28 { 29 __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77); 30 short8 R = cast(short8) _mm_add_epi16(A, A); 31 short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154]; 32 assert(R.array == correct); 33 } 34 35 /// Add packed 32-bit integers in `a` and `b`. 36 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe 37 { 38 pragma(inline, true); 39 return cast(__m128i)(cast(int4)a + cast(int4)b); 40 } 41 unittest 42 { 43 __m128i A = _mm_setr_epi32( -7, -1, 0, 9); 44 int4 R = _mm_add_epi32(A, A); 45 int[4] correct = [ -14, -2, 0, 18 ]; 46 assert(R.array == correct); 47 } 48 49 /// Add packed 64-bit integers in `a` and `b`. 50 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe 51 { 52 pragma(inline, true); 53 return cast(__m128i)(cast(long2)a + cast(long2)b); 54 } 55 unittest 56 { 57 __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000); 58 long2 R = cast(long2) _mm_add_epi64(A, A); 59 long[2] correct = [ -2, 0 ]; 60 assert(R.array == correct); 61 } 62 63 /// Add packed 8-bit integers in `a` and `b`. 64 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe 65 { 66 pragma(inline, true); 67 return cast(__m128i)(cast(byte16)a + cast(byte16)b); 68 } 69 unittest 70 { 71 __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78); 72 byte16 R = cast(byte16) _mm_add_epi8(A, A); 73 byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100]; 74 assert(R.array == correct); 75 } 76 77 /// Add the lower double-precision (64-bit) floating-point element 78 /// in `a` and `b`, store the result in the lower element of dst, 79 /// and copy the upper element from `a` to the upper element of destination. 80 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe 81 { 82 static if (GDC_with_SSE2) 83 { 84 return __builtin_ia32_addsd(a, b); 85 } 86 else version(DigitalMars) 87 { 88 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 89 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 90 asm pure nothrow @nogc @trusted { nop;} 91 a[0] = a[0] + b[0]; 92 return a; 93 } 94 else 95 { 96 a[0] += b[0]; 97 return a; 98 } 99 } 100 unittest 101 { 102 __m128d a = [1.5, -2.0]; 103 a = _mm_add_sd(a, a); 104 assert(a.array == [3.0, -2.0]); 105 } 106 107 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`. 108 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe 109 { 110 pragma(inline, true); 111 return a + b; 112 } 113 unittest 114 { 115 __m128d a = [1.5, -2.0]; 116 a = _mm_add_pd(a, a); 117 assert(a.array == [3.0, -4.0]); 118 } 119 120 /// Add 64-bit integers `a` and `b`. 
121 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe 122 { 123 pragma(inline, true); 124 return a + b; 125 } 126 127 /// Add packed 16-bit integers in `a` and `b` using signed saturation. 128 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted 129 { 130 static if (GDC_with_SSE2) 131 { 132 return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b); 133 } 134 else version(LDC) 135 { 136 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 137 { 138 // x86: Generates PADDSW since LDC 1.15 -O0 139 // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20 140 enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 141 enum ir = ` 142 %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 143 ret <8 x i16> %r`; 144 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 145 } 146 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 147 { 148 short[8] res; 149 short8 sa = cast(short8)a; 150 short8 sb = cast(short8)b; 151 foreach(i; 0..8) 152 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 153 return _mm_loadu_si128(cast(int4*)res.ptr); 154 } 155 else 156 return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b); 157 } 158 else 159 { 160 short[8] res; 161 short8 sa = cast(short8)a; 162 short8 sb = cast(short8)b; 163 foreach(i; 0..8) 164 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 165 return _mm_loadu_si128(cast(int4*)res.ptr); 166 } 167 } 168 unittest 169 { 170 short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), 171 _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0)); 172 static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14]; 173 assert(res.array == correctResult); 174 } 175 176 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation. 
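// Example (illustrative sketch, values chosen to hit the boundaries; not part of the
// original unittests): unlike _mm_add_epi16, which wraps around on overflow,
// _mm_adds_epi16 clamps each lane to the signed 16-bit range [-32768, 32767].
//
//     __m128i m   = _mm_set1_epi16(short.max);                  // 32767 in every lane
//     __m128i one = _mm_set1_epi16(1);
//     short8 wrapped   = cast(short8) _mm_add_epi16 (m, one);   // each lane: -32768
//     short8 saturated = cast(short8) _mm_adds_epi16(m, one);   // each lane:  32767
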
177 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted 178 { 179 static if (GDC_with_SSE2) 180 { 181 return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b); 182 } 183 else version(LDC) 184 { 185 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 186 { 187 // x86: Generates PADDSB since LDC 1.15 -O0 188 // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20 189 enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 190 enum ir = ` 191 %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 192 ret <16 x i8> %r`; 193 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 194 } 195 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 196 { 197 byte[16] res; 198 byte16 sa = cast(byte16)a; 199 byte16 sb = cast(byte16)b; 200 foreach(i; 0..16) 201 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]); 202 return _mm_loadu_si128(cast(int4*)res.ptr); 203 } 204 else 205 return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b); 206 } 207 else 208 { 209 byte[16] res; 210 byte16 sa = cast(byte16)a; 211 byte16 sb = cast(byte16)b; 212 foreach(i; 0..16) 213 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]); 214 return _mm_loadu_si128(cast(int4*)res.ptr); 215 } 216 } 217 unittest 218 { 219 byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 220 _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 221 static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14, 222 16, 18, 20, 22, 24, 26, 28, 30]; 223 assert(res.array == correctResult); 224 } 225 226 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 227 // PERF: #GDC version? 
228 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted 229 { 230 version(LDC) 231 { 232 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 233 { 234 // x86: Generates PADDUSB since LDC 1.15 -O0 235 // ARM: Generates uqadd.16b since LDC 1.21 -O1 236 enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 237 enum ir = ` 238 %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 239 ret <16 x i8> %r`; 240 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 241 } 242 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 243 { 244 ubyte[16] res; 245 byte16 sa = cast(byte16)a; 246 byte16 sb = cast(byte16)b; 247 foreach(i; 0..16) 248 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 249 return _mm_loadu_si128(cast(int4*)res.ptr); 250 } 251 else 252 return __builtin_ia32_paddusb128(a, b); 253 } 254 else 255 { 256 ubyte[16] res; 257 byte16 sa = cast(byte16)a; 258 byte16 sb = cast(byte16)b; 259 foreach(i; 0..16) 260 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 261 return _mm_loadu_si128(cast(int4*)res.ptr); 262 } 263 } 264 unittest 265 { 266 byte16 res = cast(byte16) 267 _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0), 268 _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0)); 269 static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 270 0, cast(byte)255, 4, 6, 8, 10, 12, 14]; 271 assert(res.array == correctResult); 272 } 273 274 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation. 275 // PERF: #GDC version? 276 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted 277 { 278 version(LDC) 279 { 280 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 281 { 282 // x86: Generates PADDUSW since LDC 1.15 -O0 283 // ARM: Generates uqadd.8h since LDC 1.21 -O1 284 enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 285 enum ir = ` 286 %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 287 ret <8 x i16> %r`; 288 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 289 } 290 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 291 { 292 ushort[8] res; 293 short8 sa = cast(short8)a; 294 short8 sb = cast(short8)b; 295 foreach(i; 0..8) 296 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 297 return _mm_loadu_si128(cast(int4*)res.ptr); 298 } 299 else 300 return __builtin_ia32_paddusw128(a, b); 301 } 302 else 303 { 304 ushort[8] res; 305 short8 sa = cast(short8)a; 306 short8 sb = cast(short8)b; 307 foreach(i; 0..8) 308 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 309 return _mm_loadu_si128(cast(int4*)res.ptr); 310 } 311 } 312 unittest 313 { 314 short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0), 315 _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0)); 316 static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6]; 317 assert(res.array == correctResult); 318 } 319 320 /// Compute the bitwise AND of packed double-precision (64-bit) 321 /// floating-point elements in `a` and `b`. 
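// Example (illustrative sketch, not part of the original unittests): the unsigned
// saturation variants clamp to the top of the unsigned range instead of wrapping,
// which is why the unittests above compare against cast(byte)255 / cast(short)65535.
//
//     __m128i x = _mm_set1_epi8(cast(byte)250);
//     __m128i y = _mm_set1_epi8(10);
//     byte16 r = cast(byte16) _mm_adds_epu8(x, y);
//     // 250 + 10 saturates to 255 in every lane (reads as -1 when reinterpreted as signed byte).
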
322 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe 323 { 324 pragma(inline, true); 325 return cast(__m128d)( cast(long2)a & cast(long2)b ); 326 } 327 unittest 328 { 329 double a = 4.32; 330 double b = -78.99; 331 long correct = (*cast(long*)(&a)) & (*cast(long*)(&b)); 332 __m128d A = _mm_set_pd(a, b); 333 __m128d B = _mm_set_pd(b, a); 334 long2 R = cast(long2)( _mm_and_pd(A, B) ); 335 assert(R.array[0] == correct); 336 assert(R.array[1] == correct); 337 } 338 339 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`. 340 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe 341 { 342 pragma(inline, true); 343 return a & b; 344 } 345 unittest 346 { 347 __m128i A = _mm_set1_epi32(7); 348 __m128i B = _mm_set1_epi32(14); 349 __m128i R = _mm_and_si128(A, B); 350 int[4] correct = [6, 6, 6, 6]; 351 assert(R.array == correct); 352 } 353 354 /// Compute the bitwise NOT of packed double-precision (64-bit) 355 /// floating-point elements in `a` and then AND with `b`. 356 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe 357 { 358 return cast(__m128d)( ~(cast(long2)a) & cast(long2)b); 359 } 360 unittest 361 { 362 double a = 4.32; 363 double b = -78.99; 364 long correct = (~*cast(long*)(&a)) & ( *cast(long*)(&b)); 365 long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b)); 366 __m128d A = _mm_setr_pd(a, b); 367 __m128d B = _mm_setr_pd(b, a); 368 long2 R = cast(long2)( _mm_andnot_pd(A, B) ); 369 assert(R.array[0] == correct); 370 assert(R.array[1] == correct2); 371 } 372 373 /// Compute the bitwise NOT of 128 bits (representing integer data) 374 /// in `a` and then AND with `b`. 375 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe 376 { 377 return (~a) & b; 378 } 379 unittest 380 { 381 __m128i A = _mm_set1_epi32(7); 382 __m128i B = _mm_set1_epi32(14); 383 __m128i R = _mm_andnot_si128(A, B); 384 int[4] correct = [8, 8, 8, 8]; 385 assert(R.array == correct); 386 } 387 388 /// Average packed unsigned 16-bit integers in `a` and `b`. 389 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted 390 { 391 static if (GDC_with_SSE2) 392 { 393 return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b); 394 } 395 else static if (LDC_with_ARM64) 396 { 397 return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b); 398 } 399 else version(LDC) 400 { 401 // Generates pavgw even in LDC 1.0, even in -O0 402 // But not in ARM 403 enum ir = ` 404 %ia = zext <8 x i16> %0 to <8 x i32> 405 %ib = zext <8 x i16> %1 to <8 x i32> 406 %isum = add <8 x i32> %ia, %ib 407 %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 408 %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 409 %r = trunc <8 x i32> %isums to <8 x i16> 410 ret <8 x i16> %r`; 411 return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b); 412 } 413 else 414 { 415 short8 sa = cast(short8)a; 416 short8 sb = cast(short8)b; 417 short8 sr = void; 418 foreach(i; 0..8) 419 { 420 sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 ); 421 } 422 return cast(int4)sr; 423 } 424 } 425 unittest 426 { 427 __m128i A = _mm_set1_epi16(31); 428 __m128i B = _mm_set1_epi16(64); 429 short8 avg = cast(short8)(_mm_avg_epu16(A, B)); 430 foreach(i; 0..8) 431 assert(avg.array[i] == 48); 432 } 433 434 /// Average packed unsigned 8-bit integers in `a` and `b`. 
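// Example (illustrative sketch, not part of the original unittests): as the fallback
// code above shows, _mm_avg_epu16 computes (a + b + 1) >> 1 per lane in a widened
// intermediate, so it cannot overflow and it rounds *up* on ties.
//
//     short8 r = cast(short8) _mm_avg_epu16(_mm_set1_epi16(1), _mm_set1_epi16(2));
//     // Every lane is 2, not 1: (1 + 2 + 1) >> 1 == 2.
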
435 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted 436 { 437 static if (GDC_with_SSE2) 438 { 439 return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b); 440 } 441 else static if (LDC_with_ARM64) 442 { 443 return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b); 444 } 445 else version(LDC) 446 { 447 // Generates pavgb even in LDC 1.0, even in -O0 448 // But not in ARM 449 enum ir = ` 450 %ia = zext <16 x i8> %0 to <16 x i16> 451 %ib = zext <16 x i8> %1 to <16 x i16> 452 %isum = add <16 x i16> %ia, %ib 453 %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 454 %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 455 %r = trunc <16 x i16> %isums to <16 x i8> 456 ret <16 x i8> %r`; 457 return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 458 } 459 else 460 { 461 byte16 sa = cast(byte16)a; 462 byte16 sb = cast(byte16)b; 463 byte16 sr = void; 464 foreach(i; 0..16) 465 { 466 sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 ); 467 } 468 return cast(int4)sr; 469 } 470 } 471 unittest 472 { 473 __m128i A = _mm_set1_epi8(31); 474 __m128i B = _mm_set1_epi8(64); 475 byte16 avg = cast(byte16)(_mm_avg_epu8(A, B)); 476 foreach(i; 0..16) 477 assert(avg.array[i] == 48); 478 } 479 480 /// Shift `a` left by `bytes` bytes while shifting in zeros. 481 alias _mm_bslli_si128 = _mm_slli_si128; 482 unittest 483 { 484 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 485 byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; 486 __m128i result = _mm_bslli_si128!5(toShift); 487 assert( (cast(byte16)result).array == exact); 488 } 489 490 /// Shift `v` right by `bytes` bytes while shifting in zeros. 491 alias _mm_bsrli_si128 = _mm_srli_si128; 492 unittest 493 { 494 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 495 byte[16] exact = [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0]; 496 __m128i result = _mm_bsrli_si128!5(toShift); 497 assert( (cast(byte16)result).array == exact); 498 } 499 500 /// Cast vector of type `__m128d` to type `__m128`. 501 /// Note: Also possible with a regular `cast(__m128)(a)`. 502 __m128 _mm_castpd_ps (__m128d a) pure @safe 503 { 504 return cast(__m128)a; 505 } 506 507 /// Cast vector of type `__m128d` to type `__m128i`. 508 /// Note: Also possible with a regular `cast(__m128i)(a)`. 509 __m128i _mm_castpd_si128 (__m128d a) pure @safe 510 { 511 return cast(__m128i)a; 512 } 513 514 /// Cast vector of type `__m128` to type `__m128d`. 515 /// Note: Also possible with a regular `cast(__m128d)(a)`. 516 __m128d _mm_castps_pd (__m128 a) pure @safe 517 { 518 return cast(__m128d)a; 519 } 520 521 /// Cast vector of type `__m128` to type `__m128i`. 522 /// Note: Also possible with a regular `cast(__m128i)(a)`. 523 __m128i _mm_castps_si128 (__m128 a) pure @safe 524 { 525 return cast(__m128i)a; 526 } 527 528 /// Cast vector of type `__m128i` to type `__m128d`. 529 /// Note: Also possible with a regular `cast(__m128d)(a)`. 530 __m128d _mm_castsi128_pd (__m128i a) pure @safe 531 { 532 return cast(__m128d)a; 533 } 534 535 /// Cast vector of type `__m128i` to type `__m128`. 536 /// Note: Also possible with a regular `cast(__m128)(a)`. 
537 __m128 _mm_castsi128_ps (__m128i a) pure @safe 538 { 539 return cast(__m128)a; 540 } 541 542 /// Invalidate and flush the cache line that contains `p` 543 /// from all levels of the cache hierarchy. 544 void _mm_clflush (const(void)* p) @trusted 545 { 546 static if (GDC_with_SSE2) 547 { 548 __builtin_ia32_clflush(p); 549 } 550 else static if (LDC_with_SSE2) 551 { 552 __builtin_ia32_clflush(cast(void*)p); 553 } 554 else version(D_InlineAsm_X86) 555 { 556 asm pure nothrow @nogc @safe 557 { 558 mov EAX, p; 559 clflush [EAX]; 560 } 561 } 562 else version(D_InlineAsm_X86_64) 563 { 564 asm pure nothrow @nogc @safe 565 { 566 mov RAX, p; 567 clflush [RAX]; 568 } 569 } 570 else 571 { 572 // Do nothing. Invalidating cacheline does 573 // not affect correctness. 574 } 575 } 576 unittest 577 { 578 ubyte[64] cacheline; 579 _mm_clflush(cacheline.ptr); 580 } 581 582 /// Compare packed 16-bit integers in `a` and `b` for equality. 583 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe 584 { 585 static if (GDC_with_SSE2) 586 { 587 return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b); 588 } 589 else 590 { 591 return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b); 592 } 593 } 594 unittest 595 { 596 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 597 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 598 short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0]; 599 short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B)); 600 assert(R.array == E); 601 } 602 603 /// Compare packed 32-bit integers in `a` and `b` for equality. 604 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe 605 { 606 static if (GDC_with_SSE2) 607 { 608 return __builtin_ia32_pcmpeqd128(a, b); 609 } 610 else 611 { 612 return equalMask!__m128i(a, b); 613 } 614 } 615 unittest 616 { 617 int4 A = [-3, -2, -1, 0]; 618 int4 B = [ 4, -2, 2, 0]; 619 int[4] E = [ 0, -1, 0, -1]; 620 int4 R = cast(int4)(_mm_cmpeq_epi32(A, B)); 621 assert(R.array == E); 622 } 623 624 /// Compare packed 8-bit integers in `a` and `b` for equality. 625 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe 626 { 627 static if (GDC_with_SSE2) 628 { 629 return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b); 630 } 631 else 632 { 633 return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b); 634 } 635 } 636 unittest 637 { 638 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 639 __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 640 byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B); 641 byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1]; 642 assert(C.array == correct); 643 } 644 645 /// Compare packed double-precision (64-bit) floating-point elements 646 /// in `a` and `b` for equality. 647 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe 648 { 649 static if (GDC_with_SSE2) 650 { 651 return __builtin_ia32_cmpeqpd(a, b); 652 } 653 else 654 { 655 return cast(__m128d) cmppd!(FPComparison.oeq)(a, b); 656 } 657 } 658 659 /// Compare the lower double-precision (64-bit) floating-point elements 660 /// in `a` and `b` for equality, store the result in the lower element, 661 /// and copy the upper element from `a`. 
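// Example (illustrative sketch; `a`, `b`, `x`, `y` stand for arbitrary __m128i values
// and are not from the original): the packed compares return a full-width mask
// (0 or -1 per lane) rather than a boolean, so they combine naturally with the
// bitwise operations above to select between two vectors without branching.
//
//     __m128i mask = _mm_cmpeq_epi32(a, b);                    // -1 where a == b, else 0
//     __m128i sel  = _mm_or_si128(_mm_and_si128   (mask, x),   // take x where equal
//                                 _mm_andnot_si128(mask, y));  // take y elsewhere
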
662 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe 663 { 664 static if (GDC_with_SSE2) 665 { 666 return __builtin_ia32_cmpeqsd(a, b); 667 } 668 else 669 { 670 return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b); 671 } 672 } 673 674 /// Compare packed double-precision (64-bit) floating-point elements 675 /// in `a` and `b` for greater-than-or-equal. 676 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe 677 { 678 static if (GDC_with_SSE2) 679 { 680 return __builtin_ia32_cmpgepd(a, b); 681 } 682 else 683 { 684 return cast(__m128d) cmppd!(FPComparison.oge)(a, b); 685 } 686 } 687 688 /// Compare the lower double-precision (64-bit) floating-point elements 689 /// in `a` and `b` for greater-than-or-equal, store the result in the 690 /// lower element, and copy the upper element from `a`. 691 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe 692 { 693 // Note: There is no __builtin_ia32_cmpgesd builtin. 694 static if (GDC_with_SSE2) 695 { 696 return __builtin_ia32_cmpnltsd(b, a); 697 } 698 else 699 { 700 return cast(__m128d) cmpsd!(FPComparison.oge)(a, b); 701 } 702 } 703 704 /// Compare packed 16-bit integers in `a` and `b` for greater-than. 705 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe 706 { 707 static if (GDC_with_SSE2) 708 { 709 return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b); 710 } 711 else 712 { 713 return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b); 714 } 715 } 716 unittest 717 { 718 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 719 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 720 short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1]; 721 short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B)); 722 assert(R.array == E); 723 } 724 725 /// Compare packed 32-bit integers in `a` and `b` for greater-than. 726 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe 727 { 728 static if (GDC_with_SSE2) 729 { 730 return __builtin_ia32_pcmpgtd128(a, b); 731 } 732 else 733 { 734 return cast(__m128i)( greaterMask!int4(a, b)); 735 } 736 } 737 unittest 738 { 739 int4 A = [-3, 2, -1, 0]; 740 int4 B = [ 4, -2, 2, 0]; 741 int[4] E = [ 0, -1, 0, 0]; 742 int4 R = cast(int4)(_mm_cmpgt_epi32(A, B)); 743 assert(R.array == E); 744 } 745 746 /// Compare packed 8-bit integers in `a` and `b` for greater-than. 747 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe 748 { 749 static if (GDC_with_SSE2) 750 { 751 return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b); 752 } 753 else 754 { 755 return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b); 756 } 757 } 758 unittest 759 { 760 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 761 __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 762 byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B); 763 byte[16] correct = [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0]; 764 __m128i D = _mm_cmpeq_epi8(A, B); 765 assert(C.array == correct); 766 } 767 768 /// Compare packed double-precision (64-bit) floating-point elements 769 /// in `a` and `b` for greater-than. 770 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe 771 { 772 static if (GDC_with_SSE2) 773 { 774 return __builtin_ia32_cmpgtpd(a, b); 775 } 776 else 777 { 778 return cast(__m128d) cmppd!(FPComparison.ogt)(a, b); 779 } 780 } 781 782 /// Compare the lower double-precision (64-bit) floating-point elements 783 /// in `a` and `b` for greater-than, store the result in the lower element, 784 /// and copy the upper element from `a`. 
785 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe 786 { 787 // Note: There is no __builtin_ia32_cmpgtsd builtin. 788 static if (GDC_with_SSE2) 789 { 790 return __builtin_ia32_cmpnlesd(b, a); 791 } 792 else 793 { 794 return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b); 795 } 796 } 797 798 /// Compare packed double-precision (64-bit) floating-point elements 799 /// in `a` and `b` for less-than-or-equal. 800 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe 801 { 802 static if (GDC_with_SSE2) 803 { 804 return __builtin_ia32_cmplepd(a, b); 805 } 806 else 807 { 808 return cast(__m128d) cmppd!(FPComparison.ole)(a, b); 809 } 810 } 811 812 /// Compare the lower double-precision (64-bit) floating-point elements 813 /// in `a` and `b` for less-than-or-equal, store the result in the 814 /// lower element, and copy the upper element from `a`. 815 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe 816 { 817 static if (GDC_with_SSE2) 818 { 819 return __builtin_ia32_cmplesd(a, b); 820 } 821 else 822 { 823 return cast(__m128d) cmpsd!(FPComparison.ole)(a, b); 824 } 825 } 826 827 /// Compare packed 16-bit integers in `a` and `b` for less-than. 828 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe 829 { 830 return _mm_cmpgt_epi16(b, a); 831 } 832 833 /// Compare packed 32-bit integers in `a` and `b` for less-than. 834 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe 835 { 836 return _mm_cmpgt_epi32(b, a); 837 } 838 839 /// Compare packed 8-bit integers in `a` and `b` for less-than. 840 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe 841 { 842 return _mm_cmpgt_epi8(b, a); 843 } 844 845 /// Compare packed double-precision (64-bit) floating-point elements 846 /// in `a` and `b` for less-than. 847 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe 848 { 849 static if (GDC_with_SSE2) 850 { 851 return __builtin_ia32_cmpltpd(a, b); 852 } 853 else 854 { 855 return cast(__m128d) cmppd!(FPComparison.olt)(a, b); 856 } 857 } 858 859 /// Compare the lower double-precision (64-bit) floating-point elements 860 /// in `a` and `b` for less-than, store the result in the lower 861 /// element, and copy the upper element from `a`. 862 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe 863 { 864 static if (GDC_with_SSE2) 865 { 866 return __builtin_ia32_cmpltsd(a, b); 867 } 868 else 869 { 870 return cast(__m128d) cmpsd!(FPComparison.olt)(a, b); 871 } 872 } 873 874 /// Compare packed double-precision (64-bit) floating-point elements 875 /// in `a` and `b` for not-equal. 876 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe 877 { 878 static if (GDC_with_SSE2) 879 { 880 return __builtin_ia32_cmpneqpd(a, b); 881 } 882 else 883 { 884 return cast(__m128d) cmppd!(FPComparison.une)(a, b); 885 } 886 } 887 888 /// Compare the lower double-precision (64-bit) floating-point elements 889 /// in `a` and `b` for not-equal, store the result in the lower 890 /// element, and copy the upper element from `a`. 891 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe 892 { 893 static if (GDC_with_SSE2) 894 { 895 return __builtin_ia32_cmpneqsd(a, b); 896 } 897 else 898 { 899 return cast(__m128d) cmpsd!(FPComparison.une)(a, b); 900 } 901 } 902 903 /// Compare packed double-precision (64-bit) floating-point elements 904 /// in `a` and `b` for not-greater-than-or-equal. 
905 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe 906 { 907 static if (GDC_with_SSE2) 908 { 909 return __builtin_ia32_cmpngepd(a, b); 910 } 911 else 912 { 913 return cast(__m128d) cmppd!(FPComparison.ult)(a, b); 914 } 915 } 916 917 /// Compare the lower double-precision (64-bit) floating-point elements 918 /// in `a` and `b` for not-greater-than-or-equal, store the result in 919 /// the lower element, and copy the upper element from `a`. 920 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe 921 { 922 // Note: There is no __builtin_ia32_cmpngesd builtin. 923 static if (GDC_with_SSE2) 924 { 925 return __builtin_ia32_cmpltsd(b, a); 926 } 927 else 928 { 929 return cast(__m128d) cmpsd!(FPComparison.ult)(a, b); 930 } 931 } 932 933 /// Compare packed double-precision (64-bit) floating-point elements 934 /// in `a` and `b` for not-greater-than. 935 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe 936 { 937 static if (GDC_with_SSE2) 938 { 939 return __builtin_ia32_cmpngtpd(a, b); 940 } 941 else 942 { 943 return cast(__m128d) cmppd!(FPComparison.ule)(a, b); 944 } 945 } 946 947 /// Compare the lower double-precision (64-bit) floating-point elements 948 /// in `a` and `b` for not-greater-than, store the result in the 949 /// lower element, and copy the upper element from `a`. 950 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe 951 { 952 // Note: There is no __builtin_ia32_cmpngtsd builtin. 953 static if (GDC_with_SSE2) 954 { 955 return __builtin_ia32_cmplesd(b, a); 956 } 957 else 958 { 959 return cast(__m128d) cmpsd!(FPComparison.ule)(a, b); 960 } 961 } 962 963 /// Compare packed double-precision (64-bit) floating-point elements 964 /// in `a` and `b` for not-less-than-or-equal. 965 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe 966 { 967 static if (GDC_with_SSE2) 968 { 969 return __builtin_ia32_cmpnlepd(a, b); 970 } 971 else 972 { 973 return cast(__m128d) cmppd!(FPComparison.ugt)(a, b); 974 } 975 } 976 977 /// Compare the lower double-precision (64-bit) floating-point elements 978 /// in `a` and `b` for not-less-than-or-equal, store the result in the 979 /// lower element, and copy the upper element from `a`. 980 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe 981 { 982 static if (GDC_with_SSE2) 983 { 984 return __builtin_ia32_cmpnlesd(a, b); 985 } 986 else 987 { 988 return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b); 989 } 990 } 991 992 /// Compare packed double-precision (64-bit) floating-point elements 993 /// in `a` and `b` for not-less-than. 994 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe 995 { 996 static if (GDC_with_SSE2) 997 { 998 return __builtin_ia32_cmpnltpd(a, b); 999 } 1000 else 1001 { 1002 return cast(__m128d) cmppd!(FPComparison.uge)(a, b); 1003 } 1004 } 1005 1006 /// Compare the lower double-precision (64-bit) floating-point elements 1007 /// in `a` and `b` for not-less-than, store the result in the lower 1008 /// element, and copy the upper element from `a`. 1009 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe 1010 { 1011 static if (GDC_with_SSE2) 1012 { 1013 return __builtin_ia32_cmpnltsd(a, b); 1014 } 1015 else 1016 { 1017 return cast(__m128d) cmpsd!(FPComparison.uge)(a, b); 1018 } 1019 } 1020 1021 /// Compare packed double-precision (64-bit) floating-point elements 1022 /// in `a` and `b` to see if neither is NaN. 
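// Example (illustrative sketch, not part of the original unittests): the "not-..."
// comparisons are not simply the complement of the ordered ones; they differ only
// when a NaN is involved. _mm_cmplt_pd uses an ordered predicate (olt) while
// _mm_cmpnge_pd uses an unordered one (ult), as the FPComparison values above show.
//
//     __m128d n = _mm_set1_pd(double.nan);
//     __m128d z = _mm_setzero_pd();
//     // _mm_cmplt_pd (n, z): both lanes 0         (ordered compare is false on NaN)
//     // _mm_cmpnge_pd(n, z): both lanes all-ones  (NaN counts as "not greater-or-equal")
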
1023 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe 1024 { 1025 static if (GDC_with_SSE2) 1026 { 1027 return __builtin_ia32_cmpordpd(a, b); 1028 } 1029 else 1030 { 1031 return cast(__m128d) cmppd!(FPComparison.ord)(a, b); 1032 } 1033 } 1034 1035 /// Compare the lower double-precision (64-bit) floating-point elements 1036 /// in `a` and `b` to see if neither is NaN, store the result in the 1037 /// lower element, and copy the upper element from `a` to the upper element. 1038 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe 1039 { 1040 static if (GDC_with_SSE2) 1041 { 1042 return __builtin_ia32_cmpordsd(a, b); 1043 } 1044 else 1045 { 1046 return cast(__m128d) cmpsd!(FPComparison.ord)(a, b); 1047 } 1048 } 1049 1050 /// Compare packed double-precision (64-bit) floating-point elements 1051 /// in `a` and `b` to see if either is NaN. 1052 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe 1053 { 1054 static if (GDC_with_SSE2) 1055 { 1056 return __builtin_ia32_cmpunordpd(a, b); 1057 } 1058 else 1059 { 1060 return cast(__m128d) cmppd!(FPComparison.uno)(a, b); 1061 } 1062 } 1063 1064 /// Compare the lower double-precision (64-bit) floating-point elements 1065 /// in `a` and `b` to see if either is NaN, store the result in the lower 1066 /// element, and copy the upper element from `a` to the upper element. 1067 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe 1068 { 1069 static if (GDC_with_SSE2) 1070 { 1071 return __builtin_ia32_cmpunordsd(a, b); 1072 } 1073 else 1074 { 1075 return cast(__m128d) cmpsd!(FPComparison.uno)(a, b); 1076 } 1077 } 1078 1079 /// Compare the lower double-precision (64-bit) floating-point element 1080 /// in `a` and `b` for equality, and return the boolean result (0 or 1). 1081 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe 1082 { 1083 // Note: For some of the _mm_comixx_sx intrinsics, NaN semantics of the intrinsic are not the same as the 1084 // comisd instruction, it returns false in case of unordered instead. 1085 // 1086 // Actually C++ compilers disagree over the meaning of that instruction. 1087 // GCC will manage NaNs like the comisd instruction (return true if unordered), 1088 // but ICC, clang and MSVC will deal with NaN like the Intel Intrinsics Guide says. 1089 // We choose to do like the most numerous. It seems GCC is buggy with NaNs. 1090 return a.array[0] == b.array[0]; 1091 } 1092 unittest 1093 { 1094 assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1095 assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1096 assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1097 assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1098 assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); 1099 } 1100 1101 /// Compare the lower double-precision (64-bit) floating-point element 1102 /// in `a` and `b` for greater-than-or-equal, and return the boolean 1103 /// result (0 or 1). 
1104 int _mm_comige_sd (__m128d a, __m128d b) pure @safe 1105 { 1106 return a.array[0] >= b.array[0]; 1107 } 1108 unittest 1109 { 1110 assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1111 assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1112 assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0))); 1113 assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1114 assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1115 assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0))); 1116 } 1117 1118 /// Compare the lower double-precision (64-bit) floating-point element 1119 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1). 1120 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe 1121 { 1122 return a.array[0] > b.array[0]; 1123 } 1124 unittest 1125 { 1126 assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1127 assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1128 assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1129 assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1130 assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); 1131 } 1132 1133 /// Compare the lower double-precision (64-bit) floating-point element 1134 /// in `a` and `b` for less-than-or-equal. 1135 int _mm_comile_sd (__m128d a, __m128d b) pure @safe 1136 { 1137 return a.array[0] <= b.array[0]; 1138 } 1139 unittest 1140 { 1141 assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1142 assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1143 assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0))); 1144 assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1145 assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1146 assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); 1147 } 1148 1149 /// Compare the lower double-precision (64-bit) floating-point element 1150 /// in `a` and `b` for less-than, and return the boolean result (0 or 1). 1151 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe 1152 { 1153 return a.array[0] < b.array[0]; 1154 } 1155 unittest 1156 { 1157 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1158 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1159 assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0))); 1160 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1161 assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1162 assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0))); 1163 } 1164 1165 /// Compare the lower double-precision (64-bit) floating-point element 1166 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1). 1167 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe 1168 { 1169 return a.array[0] != b.array[0]; 1170 } 1171 unittest 1172 { 1173 assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1174 assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1175 assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1176 assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1177 assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); 1178 } 1179 1180 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) 1181 /// floating-point elements. 
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else
    {
        // x86: Generates cvtdq2ps since LDC 1.0.0 -O1
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O2
        __m128 res;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    // PERF ARM32
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        long2 i;
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
            case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
            case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
        }
        int4 zero = 0;
        return cast(__m128i) shufflevector!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
    }
    else
    {
        // PERF ARM32
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
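// Example (illustrative sketch, not part of the original unittests): like
// _mm_cvtps_epi32 below, _mm_cvtpd_epi32 honours the current MXCSR rounding mode
// (round-to-nearest-even by default), while the _mm_cvtt* variants always truncate
// toward zero.
//
//     // With the default rounding mode:
//     // _mm_cvtpd_epi32 (_mm_set1_pd(2.5))  -> lanes 0..1 == 2 (ties round to even)
//     // _mm_cvtpd_epi32 (_mm_set1_pd(2.7))  -> lanes 0..1 == 3
//     // _mm_cvttpd_epi32(_mm_set1_pd(2.7))  -> lanes 0..1 == 2 (truncation)
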
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    {
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_cvtps2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    // GDC bug #98607
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide an optimization barrier for the rounding mode.
    // Worked around with different literals. This bug will likely only manifest in unittest.
    // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtsd_si64 (__m128d a) @trusted
{
    version (LDC)
    {
        version (X86_64)
        {
            return __builtin_ia32_cvtsd2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
    else
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///

/// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit)
/// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b);
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a.ptr[0] = b.array[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}

/// Get the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la.array[0];
}
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;

/// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 r = [0, 0, 0, 0];
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}

/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store that in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}

deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///

/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit)
/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the
/// upper element of result.
double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
/// Put zeroes in the upper elements of result.
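// Example (illustrative sketch, not part of the original unittests): _mm_cvtsi64_si128
// and _mm_cvtsi128_si64 are exact inverses on the low lane, which makes them handy for
// moving a single 64-bit scalar in and out of an XMM register.
//
//     long x = 0x0123_4567_89AB_CDEF;
//     __m128i v = _mm_cvtsi64_si128(x);      // [x, 0] as two 64-bit lanes
//     assert(_mm_cvtsi128_si64(v) == x);
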
__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: Generates fcvtzs since LDC 1.8 -O2
    __m128i r;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0,
    // but in 32-bit it's a long sequence that resorts to the FPU instead.
    return cast(long)a.array[0];
}

deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///

/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a / b;
}

/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`,
/// store the result in the lower element, and copy the upper element from `a`.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_divsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        a.ptr[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
/// Warning: the returned value is zero-extended to 32-bits.
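// Example (illustrative sketch, not part of the original unittests): the _mm_cvttsd_*
// conversions ignore MXCSR and always truncate toward zero, unlike _mm_cvtsd_si32 and
// _mm_cvtsd_si64 above, which honour the current rounding mode.
//
//     // With the default rounding mode:
//     // _mm_cvtsd_si32 (_mm_set1_pd(-2.9)) == -3   (round to nearest)
//     // _mm_cvttsd_si32(_mm_set1_pd(-2.9)) == -2   (truncation toward zero)
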
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
    assert(_mm_extract_epi16(A, 5 + 8) == 5);
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

/// Perform a serializing operation on all load-from-memory instructions that were issued
/// prior to this instruction.
void _mm_lfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_lfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
    else version(LDC)
    {
        llvm_memory_fence(); // PERF actually generates mfence
    }
    else
        static assert(false);
}
unittest
{
    _mm_lfence();
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    pragma(inline, true);
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}
unittest
{
    align(16) double[2] S = [-5.0, 7.0];
    __m128d R = _mm_load_pd(S.ptr);
    assert(R.array == S);
}

/// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double m = *mem_addr;
    __m128d r;
    r.ptr[0] = m;
    r.ptr[1] = m;
    return r;
}
unittest
{
    double what = 4;
    __m128d R = _mm_load_pd1(&what);
    double[2] correct = [4.0, 4];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper
/// element. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Load 128-bits of integer data from memory into dst.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1848 __m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shoudln't be trusted because alignment, Issue #62 1849 { 1850 pragma(inline, true); 1851 return *mem_addr; 1852 } 1853 unittest 1854 { 1855 align(16) int[4] correct = [-1, 2, 3, 4]; 1856 int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr); 1857 assert(A.array == correct); 1858 } 1859 1860 alias _mm_load1_pd = _mm_load_pd1; /// 1861 1862 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 1863 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary. 1864 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted 1865 { 1866 pragma(inline, true); 1867 a.ptr[1] = *mem_addr; 1868 return a; 1869 } 1870 unittest 1871 { 1872 double A = 7.0; 1873 __m128d B = _mm_setr_pd(4.0, -5.0); 1874 __m128d R = _mm_loadh_pd(B, &A); 1875 double[2] correct = [ 4.0, 7.0 ]; 1876 assert(R.array == correct); 1877 } 1878 1879 /// Load 64-bit integer from memory into the first element of result. Zero out the other. 1880 // Note: strange signature since the memory doesn't have to aligned (Issue #60) 1881 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature 1882 { 1883 pragma(inline, true); 1884 auto pLong = cast(const(long)*)mem_addr; 1885 long2 r = [0, 0]; 1886 r.ptr[0] = *pLong; 1887 return cast(__m128i)(r); 1888 } 1889 unittest 1890 { 1891 long A = 0x7878787870707070; 1892 long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A); 1893 long[2] correct = [0x7878787870707070, 0]; 1894 assert(R.array == correct); 1895 } 1896 1897 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 1898 /// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary. 1899 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted 1900 { 1901 a.ptr[0] = *mem_addr; 1902 return a; 1903 } 1904 unittest 1905 { 1906 double A = 7.0; 1907 __m128d B = _mm_setr_pd(4.0, -5.0); 1908 __m128d R = _mm_loadl_pd(B, &A); 1909 double[2] correct = [ 7.0, -5.0 ]; 1910 assert(R.array == correct); 1911 } 1912 1913 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 1914 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1915 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted 1916 { 1917 __m128d a = *cast(__m128d*)(mem_addr); 1918 __m128d r; 1919 r.ptr[0] = a.array[1]; 1920 r.ptr[1] = a.array[0]; 1921 return r; 1922 } 1923 unittest 1924 { 1925 align(16) double[2] A = [56.0, -74.0]; 1926 __m128d R = _mm_loadr_pd(A.ptr); 1927 double[2] correct = [-74.0, 56.0]; 1928 assert(R.array == correct); 1929 } 1930 1931 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 1932 /// `mem_addr` does not need to be aligned on any particular boundary. 
1933 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted 1934 { 1935 pragma(inline, true); 1936 static if (GDC_with_SSE2) 1937 { 1938 return __builtin_ia32_loadupd(mem_addr); 1939 } 1940 else version(LDC) 1941 { 1942 return loadUnaligned!(double2)(mem_addr); 1943 } 1944 else version(DigitalMars) 1945 { 1946 static if (DMD_with_DSIMD) 1947 { 1948 return cast(__m128d)__simd(XMM.LODUPD, *mem_addr); 1949 } 1950 else static if (SSESizedVectorsAreEmulated) 1951 { 1952 // Since this vector is emulated, it doesn't have alignement constraints 1953 // and as such we can just cast it. 1954 return *cast(__m128d*)(mem_addr); 1955 } 1956 else 1957 { 1958 __m128d result; 1959 result.ptr[0] = mem_addr[0]; 1960 result.ptr[1] = mem_addr[1]; 1961 return result; 1962 } 1963 } 1964 else 1965 { 1966 __m128d result; 1967 result.ptr[0] = mem_addr[0]; 1968 result.ptr[1] = mem_addr[1]; 1969 return result; 1970 } 1971 } 1972 unittest 1973 { 1974 double[2] A = [56.0, -75.0]; 1975 __m128d R = _mm_loadu_pd(A.ptr); 1976 double[2] correct = [56.0, -75.0]; 1977 assert(R.array == correct); 1978 } 1979 1980 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary. 1981 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted 1982 { 1983 pragma(inline, true); 1984 static if (GDC_with_SSE2) 1985 { 1986 return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr); 1987 } 1988 else 1989 { 1990 return loadUnaligned!(__m128i)(cast(int*)mem_addr); 1991 } 1992 } 1993 unittest 1994 { 1995 align(16) int[4] correct = [-1, 2, -3, 4]; 1996 int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr); 1997 assert(A.array == correct); 1998 } 1999 2000 /// Load unaligned 32-bit integer from memory into the first element of result. 2001 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted 2002 { 2003 pragma(inline, true); 2004 int r = *cast(int*)(mem_addr); 2005 int4 result = [0, 0, 0, 0]; 2006 result.ptr[0] = r; 2007 return result; 2008 } 2009 unittest 2010 { 2011 int r = 42; 2012 __m128i A = _mm_loadu_si32(&r); 2013 int[4] correct = [42, 0, 0, 0]; 2014 assert(A.array == correct); 2015 } 2016 2017 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 2018 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 2019 /// and pack the results in destination. 
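// Each 32-bit lane of the result is a[2i]*b[2i] + a[2i+1]*b[2i+1], which makes this intrinsic
// a handy building block for dot products of 16-bit data. Illustrative sketch (not from the
// original sources): the four partial sums can then be reduced with _mm_add_epi32 or a scalar
// loop over `.array`.
//   __m128i pairSums = _mm_madd_epi16(a, b); // [a0*b0 + a1*b1, a2*b2 + a3*b3, ...]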
2020 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted 2021 { 2022 static if (GDC_with_SSE2) 2023 { 2024 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2025 } 2026 else static if (LDC_with_SSE2) 2027 { 2028 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2029 } 2030 else static if (LDC_with_ARM64) 2031 { 2032 int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b)); 2033 int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b)); 2034 int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); 2035 int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); 2036 return vcombine_s32(rl, rh); 2037 } 2038 else 2039 { 2040 short8 sa = cast(short8)a; 2041 short8 sb = cast(short8)b; 2042 int4 r; 2043 foreach(i; 0..4) 2044 { 2045 r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1]; 2046 } 2047 return r; 2048 } 2049 } 2050 unittest 2051 { 2052 short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2053 short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2054 int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B); 2055 int[4] correct = [1, 13, -2147483648, 2*32767*32767]; 2056 assert(R.array == correct); 2057 } 2058 2059 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 2060 /// (elements are not stored when the highest bit is not set in the corresponding element) 2061 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 2062 /// boundary. 2063 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted 2064 { 2065 static if (GDC_with_SSE2) 2066 { 2067 return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr); 2068 } 2069 else static if (LDC_with_SSE2) 2070 { 2071 return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr); 2072 } 2073 else static if (LDC_with_ARM64) 2074 { 2075 // PERF: catastrophic on ARM32 2076 byte16 bmask = cast(byte16)mask; 2077 byte16 shift = 7; 2078 bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask 2079 mask = cast(__m128i) bmask; 2080 __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr); 2081 dest = (a & mask) | (dest & ~mask); 2082 storeUnaligned!__m128i(dest, cast(int*)mem_addr); 2083 } 2084 else 2085 { 2086 byte16 b = cast(byte16)a; 2087 byte16 m = cast(byte16)mask; 2088 byte* dest = cast(byte*)(mem_addr); 2089 foreach(j; 0..16) 2090 { 2091 if (m.array[j] & 128) 2092 { 2093 dest[j] = b.array[j]; 2094 } 2095 } 2096 } 2097 } 2098 unittest 2099 { 2100 ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]; 2101 __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0); 2102 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15); 2103 _mm_maskmoveu_si128(A, mask, dest.ptr); 2104 ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42]; 2105 assert(dest == correct); 2106 } 2107 2108 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values. 
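// Without a dedicated instruction, a packed signed max can be written as a compare followed
// by a blend; the non-LDC fallback below uses an equivalent xor-based selection. Illustrative
// sketch of the straightforward form (the mask selects `a` where it is greater):
//   __m128i mask = _mm_cmpgt_epi16(a, b);   // -1 where a > b, 0 elsewhere
//   __m128i r    = _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));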
2109 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe 2110 { 2111 static if (GDC_with_SSE2) 2112 { 2113 return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b); 2114 } 2115 else version(LDC) 2116 { 2117 // x86: pmaxsw since LDC 1.0 -O1 2118 // ARM: smax.8h since LDC 1.5 -01 2119 short8 sa = cast(short8)a; 2120 short8 sb = cast(short8)b; 2121 short8 greater = greaterMask!short8(sa, sb); 2122 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2123 } 2124 else 2125 { 2126 __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else 2127 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2128 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2129 return _mm_xor_si128(b, mask); 2130 } 2131 } 2132 unittest 2133 { 2134 short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57), 2135 _mm_setr_epi16(-4,-8, 9, 7, 0,-32768, 0, 0)); 2136 short[8] correct = [32767, 1, 9, 7, 9, 7, 0, 0]; 2137 assert(R.array == correct); 2138 } 2139 2140 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values. 2141 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe 2142 { 2143 version(LDC) 2144 { 2145 // x86: pmaxub since LDC 1.0.0 -O1 2146 // ARM64: umax.16b since LDC 1.5.0 -O1 2147 // PERF: catastrophic on ARM32 2148 ubyte16 sa = cast(ubyte16)a; 2149 ubyte16 sb = cast(ubyte16)b; 2150 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2151 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2152 } 2153 else 2154 { 2155 __m128i value128 = _mm_set1_epi8(-128); 2156 __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2157 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2158 __m128i mask = _mm_and_si128(aTob, higher); 2159 return _mm_xor_si128(b, mask); 2160 } 2161 } 2162 unittest 2163 { 2164 byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2165 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2166 byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 2167 assert(R.array == correct); 2168 } 2169 2170 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values. 2171 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted 2172 { 2173 static if (GDC_with_SSE2) 2174 { 2175 return __builtin_ia32_maxpd(a, b); 2176 } 2177 else 2178 { 2179 // x86: Generates maxpd starting with LDC 1.9 -O2 2180 a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; 2181 a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1]; 2182 return a; 2183 } 2184 } 2185 unittest 2186 { 2187 __m128d A = _mm_setr_pd(4.0, 1.0); 2188 __m128d B = _mm_setr_pd(1.0, 8.0); 2189 __m128d M = _mm_max_pd(A, B); 2190 assert(M.array[0] == 4.0); 2191 assert(M.array[1] == 8.0); 2192 } 2193 2194 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 2195 /// lower element of result, and copy the upper element from `a` to the upper element of result. 2196 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted 2197 { 2198 static if (GDC_with_SSE2) 2199 { 2200 return __builtin_ia32_maxsd(a, b); 2201 } 2202 else 2203 { 2204 __m128d r = a; 2205 // Generates maxsd starting with LDC 1.3 2206 r.ptr[0] = (a.array[0] > b.array[0]) ? 
a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 1.0);
}

/// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to
/// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction
/// is globally visible before any memory instruction which follows the fence in program order.
void _mm_mfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_mfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
    else version(LDC)
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_mfence();
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 greater = greaterMask!short8(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lowerShorts);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-32768),
                                          _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0));
    short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -32768];
    assert(R.array == correct);
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
2300 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe 2301 { 2302 version(LDC) 2303 { 2304 // x86: pminub since LDC 1.0.0 -O1 2305 // ARM: umin.16b since LDC 1.5.0 -O1 2306 // PERF: catastrophic on ARM32 2307 ubyte16 sa = cast(ubyte16)a; 2308 ubyte16 sb = cast(ubyte16)b; 2309 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2310 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2311 } 2312 else 2313 { 2314 __m128i value128 = _mm_set1_epi8(-128); 2315 __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2316 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2317 __m128i mask = _mm_and_si128(aTob, lower); 2318 return _mm_xor_si128(b, mask); 2319 } 2320 } 2321 unittest 2322 { 2323 byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2324 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2325 byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; 2326 assert(R.array == correct); 2327 } 2328 2329 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values. 2330 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted 2331 { 2332 static if (GDC_with_SSE2) 2333 { 2334 return __builtin_ia32_minpd(a, b); 2335 } 2336 else 2337 { 2338 // Generates minpd starting with LDC 1.9 2339 a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2340 a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1]; 2341 return a; 2342 } 2343 } 2344 unittest 2345 { 2346 __m128d A = _mm_setr_pd(1.0, 2.0); 2347 __m128d B = _mm_setr_pd(4.0, 1.0); 2348 __m128d M = _mm_min_pd(A, B); 2349 assert(M.array[0] == 1.0); 2350 assert(M.array[1] == 1.0); 2351 } 2352 2353 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 2354 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 2355 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe 2356 { 2357 static if (GDC_with_SSE2) 2358 { 2359 return __builtin_ia32_minsd(a, b); 2360 } 2361 else 2362 { 2363 // Generates minsd starting with LDC 1.3 2364 __m128d r = a; 2365 r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2366 return r; 2367 } 2368 } 2369 unittest 2370 { 2371 __m128d A = _mm_setr_pd(1.0, 3.0); 2372 __m128d B = _mm_setr_pd(4.0, 2.0); 2373 __m128d M = _mm_min_sd(A, B); 2374 assert(M.array[0] == 1.0); 2375 assert(M.array[1] == 3.0); 2376 } 2377 2378 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element. 2379 __m128i _mm_move_epi64 (__m128i a) pure @trusted 2380 { 2381 static if (GDC_with_SSE2) 2382 { 2383 // slightly better with GDC -O0 2384 return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 2385 } 2386 else 2387 { 2388 long2 result = [ 0, 0 ]; 2389 long2 la = cast(long2) a; 2390 result.ptr[0] = la.array[0]; 2391 return cast(__m128i)(result); 2392 } 2393 } 2394 unittest 2395 { 2396 long2 A = [13, 47]; 2397 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); 2398 long[2] correct = [13, 0]; 2399 assert(B.array == correct); 2400 } 2401 2402 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 2403 /// the upper element from `a` to the upper element of dst. 
2404 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted 2405 { 2406 static if (GDC_with_SSE2) 2407 { 2408 return __builtin_ia32_movsd(a, b); 2409 } 2410 else 2411 { 2412 b.ptr[1] = a.array[1]; 2413 return b; 2414 } 2415 } 2416 unittest 2417 { 2418 double2 A = [13.0, 47.0]; 2419 double2 B = [34.0, 58.0]; 2420 double2 C = _mm_move_sd(A, B); 2421 double[2] correct = [34.0, 47.0]; 2422 assert(C.array == correct); 2423 } 2424 2425 /// Create mask from the most significant bit of each 8-bit element in `v`. 2426 int _mm_movemask_epi8 (__m128i a) pure @trusted 2427 { 2428 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 2429 static if (GDC_with_SSE2) 2430 { 2431 return __builtin_ia32_pmovmskb128(cast(ubyte16)a); 2432 } 2433 else static if (LDC_with_SSE2) 2434 { 2435 return __builtin_ia32_pmovmskb128(cast(byte16)a); 2436 } 2437 else static if (LDC_with_ARM64) 2438 { 2439 // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon 2440 // The other two solutions lead to unfound intrinsics in LLVM and that took a long time. 2441 // SO there might be something a bit faster, but this one is reasonable and branchless. 2442 byte8 mask_shift; 2443 mask_shift.ptr[0] = 7; 2444 mask_shift.ptr[1] = 6; 2445 mask_shift.ptr[2] = 5; 2446 mask_shift.ptr[3] = 4; 2447 mask_shift.ptr[4] = 3; 2448 mask_shift.ptr[5] = 2; 2449 mask_shift.ptr[6] = 1; 2450 mask_shift.ptr[7] = 0; 2451 byte8 mask_and = byte8(-128); 2452 byte8 lo = vget_low_u8(cast(byte16)a); 2453 byte8 hi = vget_high_u8(cast(byte16)a); 2454 lo = vand_u8(lo, mask_and); 2455 lo = vshr_u8(lo, mask_shift); 2456 hi = vand_u8(hi, mask_and); 2457 hi = vshr_u8(hi, mask_shift); 2458 lo = vpadd_u8(lo,lo); 2459 lo = vpadd_u8(lo,lo); 2460 lo = vpadd_u8(lo,lo); 2461 hi = vpadd_u8(hi,hi); 2462 hi = vpadd_u8(hi,hi); 2463 hi = vpadd_u8(hi,hi); 2464 return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]); 2465 } 2466 else 2467 { 2468 byte16 ai = cast(byte16)a; 2469 int r = 0; 2470 foreach(bit; 0..16) 2471 { 2472 if (ai.array[bit] < 0) r += (1 << bit); 2473 } 2474 return r; 2475 } 2476 } 2477 unittest 2478 { 2479 assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0))); 2480 } 2481 2482 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 2483 /// loating-point element in `v`. 2484 int _mm_movemask_pd(__m128d v) pure @safe 2485 { 2486 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 2487 static if (GDC_with_SSE2) 2488 { 2489 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2490 /// packed double-precision (64-bit) floating-point element in `v`. 2491 return __builtin_ia32_movmskpd(v); 2492 } 2493 else static if (LDC_with_SSE2) 2494 { 2495 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2496 /// packed double-precision (64-bit) floating-point element in `v`. 2497 return __builtin_ia32_movmskpd(v); 2498 } 2499 else 2500 { 2501 long2 lv = cast(long2)v; 2502 int r = 0; 2503 if (lv.array[0] < 0) r += 1; 2504 if (lv.array[1] < 0) r += 2; 2505 return r; 2506 } 2507 } 2508 unittest 2509 { 2510 __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0); 2511 assert(_mm_movemask_pd(A) == 2); 2512 } 2513 2514 /// Copy the lower 64-bit integer in `v`. 
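// A common use of _mm_movemask_epi8 (defined above) is byte search: compare against a needle,
// collect the 16 sign bits, then scan the mask. Illustrative sketch, assuming a `byte needle`,
// a `__m128i chunk` of input, and `import core.bitop : bsf` for the bit scan:
//   __m128i hits = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(needle));
//   int     mask = _mm_movemask_epi8(hits);      // bit i set <=> byte i matched
//   if (mask != 0) { int index = bsf(mask); }    // position of the first matching byte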
2515 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe 2516 { 2517 long2 lv = cast(long2)v; 2518 return long1(lv.array[0]); 2519 } 2520 unittest 2521 { 2522 __m128i A = _mm_set_epi64x(-1, -2); 2523 __m64 R = _mm_movepi64_pi64(A); 2524 assert(R.array[0] == -2); 2525 } 2526 2527 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element. 2528 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted 2529 { 2530 long2 r; 2531 r.ptr[0] = a.array[0]; 2532 r.ptr[1] = 0; 2533 return cast(__m128i)r; 2534 } 2535 2536 // Note: generates pmuludq in LDC with -O1 2537 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted 2538 { 2539 __m128i zero = _mm_setzero_si128(); 2540 2541 static if (__VERSION__ >= 2088) 2542 { 2543 // Need LLVM9 to avoid this shufflevector 2544 long2 la, lb; 2545 la.ptr[0] = cast(uint)a.array[0]; 2546 la.ptr[1] = cast(uint)a.array[2]; 2547 lb.ptr[0] = cast(uint)b.array[0]; 2548 lb.ptr[1] = cast(uint)b.array[2]; 2549 } 2550 else 2551 { 2552 long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero); 2553 long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero); 2554 } 2555 2556 version(DigitalMars) 2557 { 2558 // DMD has no long2 mul 2559 // long2 mul not supported before LDC 1.5 2560 la.ptr[0] *= lb.array[0]; 2561 la.ptr[1] *= lb.array[1]; 2562 return cast(__m128i)(la); 2563 } 2564 else 2565 { 2566 static if (__VERSION__ >= 2076) 2567 { 2568 return cast(__m128i)(la * lb); 2569 } 2570 else 2571 { 2572 // long2 mul not supported before LDC 1.5 2573 la.ptr[0] *= lb.array[0]; 2574 la.ptr[1] *= lb.array[1]; 2575 return cast(__m128i)(la); 2576 } 2577 } 2578 } 2579 unittest 2580 { 2581 __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff); 2582 __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff); 2583 __m128i C = _mm_mul_epu32(A, B); 2584 long2 LC = cast(long2)C; 2585 assert(LC.array[0] == 18446744065119617025uL); 2586 assert(LC.array[1] == 12723420444339690338uL); 2587 } 2588 2589 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 2590 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe 2591 { 2592 pragma(inline, true); 2593 return a * b; 2594 } 2595 unittest 2596 { 2597 __m128d a = [-2.0, 1.5]; 2598 a = _mm_mul_pd(a, a); 2599 assert(a.array == [4.0, 2.25]); 2600 } 2601 2602 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 2603 /// element of result, and copy the upper element from `a` to the upper element of result. 2604 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted 2605 { 2606 version(DigitalMars) 2607 { 2608 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 2609 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 2610 asm pure nothrow @nogc @trusted { nop;} 2611 a.array[0] = a.array[0] * b.array[0]; 2612 return a; 2613 } 2614 else static if (GDC_with_SSE2) 2615 { 2616 return __builtin_ia32_mulsd(a, b); 2617 } 2618 else 2619 { 2620 a.ptr[0] *= b.array[0]; 2621 return a; 2622 } 2623 } 2624 unittest 2625 { 2626 __m128d a = [-2.0, 1.5]; 2627 a = _mm_mul_sd(a, a); 2628 assert(a.array == [4.0, 1.5]); 2629 } 2630 2631 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 2632 /// and get an unsigned 64-bit result. 
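// _mm_mul_epu32 (defined above) multiplies only the even 32-bit lanes (indices 0 and 2) of
// each operand, zero-extended, and returns the two unsigned 64-bit products. Illustrative sketch:
//   __m128i prod = _mm_mul_epu32(a, b);   // lanes: [ a0*b0, a2*b2 ] as unsigned 64-bit
//   long2   p    = cast(long2) prod;      // p.array[0] == a0*b0, p.array[1] == a2*b2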
2633 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe 2634 { 2635 return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b))); 2636 } 2637 unittest 2638 { 2639 __m64 A = _mm_set_pi32(42, 0xDEADBEEF); 2640 __m64 B = _mm_set_pi32(42, 0xCAFEBABE); 2641 __m64 C = _mm_mul_su32(A, B); 2642 assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL); 2643 } 2644 2645 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2646 /// high 16 bits of the intermediate integers. 2647 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted 2648 { 2649 static if (GDC_with_SSE2) 2650 { 2651 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2652 } 2653 else static if (LDC_with_SSE2) 2654 { 2655 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2656 } 2657 else 2658 { 2659 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h 2660 // PERF: it seems the simde solution has one less instruction in ARM64. 2661 // PERF: Catastrophic in ARM32. 2662 short8 sa = cast(short8)a; 2663 short8 sb = cast(short8)b; 2664 short8 r = void; 2665 r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16; 2666 r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16; 2667 r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16; 2668 r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16; 2669 r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16; 2670 r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16; 2671 r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16; 2672 r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16; 2673 return cast(__m128i)r; 2674 } 2675 } 2676 unittest 2677 { 2678 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2679 __m128i B = _mm_set1_epi16(16384); 2680 short8 R = cast(short8)_mm_mulhi_epi16(A, B); 2681 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; 2682 assert(R.array == correct); 2683 } 2684 2685 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2686 /// high 16 bits of the intermediate integers. 2687 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted 2688 { 2689 static if (GDC_with_SSE2) 2690 { 2691 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2692 } 2693 else static if (LDC_with_SSE2) 2694 { 2695 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2696 } 2697 else 2698 { 2699 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h 2700 // it seems the simde solution has one less instruction in ARM64 2701 // PERF: Catastrophic in ARM32. 
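        // Scalar fallback: each pair of lanes is zero-extended through ushort, multiplied,
        // and only the high 16 bits of the 32-bit product are kept (the ">> 16" below).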
2702 short8 sa = cast(short8)a; 2703 short8 sb = cast(short8)b; 2704 short8 r = void; 2705 r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 ); 2706 r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 ); 2707 r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 ); 2708 r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 ); 2709 r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 ); 2710 r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 ); 2711 r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 ); 2712 r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 ); 2713 return cast(__m128i)r; 2714 } 2715 } 2716 unittest 2717 { 2718 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2719 __m128i B = _mm_set1_epi16(16384); 2720 short8 R = cast(short8)_mm_mulhi_epu16(A, B); 2721 short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; 2722 assert(R.array == correct); 2723 } 2724 2725 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 2726 /// bits of the intermediate integers. 2727 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe 2728 { 2729 return cast(__m128i)(cast(short8)a * cast(short8)b); 2730 } 2731 unittest 2732 { 2733 __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7); 2734 __m128i B = _mm_set1_epi16(16384); 2735 short8 R = cast(short8)_mm_mullo_epi16(A, B); 2736 short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384]; 2737 assert(R.array == correct); 2738 } 2739 2740 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 2741 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe 2742 { 2743 pragma(inline, true); 2744 return cast(__m128d)( cast(__m128i)a | cast(__m128i)b ); 2745 } 2746 2747 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`. 2748 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe 2749 { 2750 pragma(inline, true); 2751 return a | b; 2752 } 2753 2754 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation. 
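// Packing narrows with saturation rather than truncation: out-of-range values clamp to the
// target type's min/max. A typical two-step narrowing pipeline (32-bit -> 16-bit -> 8-bit),
// as an illustrative sketch:
//   __m128i w  = _mm_packs_epi32(i0, i1); // 8 ints   -> 8 shorts, clamped to [-32768, 32767]
//   __m128i b8 = _mm_packs_epi16(w, w);   // 8 shorts -> 16 bytes, clamped to [-128, 127]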
2755 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted 2756 { 2757 static if (GDC_with_SSE2) 2758 { 2759 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2760 } 2761 else static if (LDC_with_SSE2) 2762 { 2763 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2764 } 2765 else static if (LDC_with_ARM64) 2766 { 2767 short4 ra = vqmovn_s32(cast(int4)a); 2768 short4 rb = vqmovn_s32(cast(int4)b); 2769 return cast(__m128i)vcombine_s16(ra, rb); 2770 } 2771 else 2772 { 2773 // PERF: catastrophic on ARM32 2774 short8 r; 2775 r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]); 2776 r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]); 2777 r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]); 2778 r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]); 2779 r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]); 2780 r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]); 2781 r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]); 2782 r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]); 2783 return cast(__m128i)r; 2784 } 2785 } 2786 unittest 2787 { 2788 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 2789 short8 R = cast(short8) _mm_packs_epi32(A, A); 2790 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; 2791 assert(R.array == correct); 2792 } 2793 2794 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation. 2795 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted 2796 { 2797 static if (GDC_with_SSE2) 2798 { 2799 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2800 } 2801 else static if (LDC_with_SSE2) 2802 { 2803 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2804 } 2805 else static if (LDC_with_ARM64) 2806 { 2807 // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02 2808 byte8 ra = vqmovn_s16(cast(short8)a); 2809 byte8 rb = vqmovn_s16(cast(short8)b); 2810 return cast(__m128i)vcombine_s8(ra, rb); 2811 } 2812 else 2813 { 2814 // PERF: ARM32 is missing 2815 byte16 r; 2816 short8 sa = cast(short8)a; 2817 short8 sb = cast(short8)b; 2818 foreach(i; 0..8) 2819 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]); 2820 foreach(i; 0..8) 2821 r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]); 2822 return cast(__m128i)r; 2823 } 2824 } 2825 unittest 2826 { 2827 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); 2828 byte16 R = cast(byte16) _mm_packs_epi16(A, A); 2829 byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0, 2830 127, -128, 127, 0, 127, -128, 127, 0]; 2831 assert(R.array == correct); 2832 } 2833 2834 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation. 
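// Unsigned saturation clamps each signed 16-bit lane to [0, 255], which is the usual last step
// when converting widened pixel arithmetic back to 8-bit channels. Illustrative sketch:
//   __m128i pixels = _mm_packus_epi16(lo8, hi8); // 16 shorts -> 16 unsigned bytes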
2835 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2836 { 2837 static if (GDC_with_SSE2) 2838 { 2839 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2840 } 2841 else static if (LDC_with_SSE2) 2842 { 2843 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2844 } 2845 else static if (LDC_with_ARM64) 2846 { 2847 // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02 2848 byte8 ra = vqmovun_s16(cast(short8)a); 2849 byte8 rb = vqmovun_s16(cast(short8)b); 2850 return cast(__m128i)vcombine_s8(ra, rb); 2851 } 2852 else 2853 { 2854 short8 sa = cast(short8)a; 2855 short8 sb = cast(short8)b; 2856 ubyte[16] result = void; 2857 for (int i = 0; i < 8; ++i) 2858 { 2859 short s = sa[i]; 2860 if (s < 0) s = 0; 2861 if (s > 255) s = 255; 2862 result[i] = cast(ubyte)s; 2863 2864 s = sb[i]; 2865 if (s < 0) s = 0; 2866 if (s > 255) s = 255; 2867 result[i+8] = cast(ubyte)s; 2868 } 2869 return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr); 2870 } 2871 } 2872 unittest 2873 { 2874 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 2875 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 2876 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 2877 0, 255, 0, 255, 255, 2, 1, 0]; 2878 foreach(i; 0..16) 2879 assert(AA.array[i] == cast(byte)(correctResult[i])); 2880 } 2881 2882 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 2883 /// and power consumption of spin-wait loops. 2884 void _mm_pause() @trusted 2885 { 2886 version(GNU) 2887 { 2888 static if (GDC_with_SSE2) 2889 { 2890 __builtin_ia32_pause(); 2891 } 2892 else version(X86) 2893 { 2894 asm pure nothrow @nogc @trusted 2895 { 2896 "pause;\n" : : : ; 2897 } 2898 } 2899 else 2900 static assert(false); 2901 } 2902 else static if (LDC_with_SSE2) 2903 { 2904 __builtin_ia32_pause(); 2905 } 2906 else static if (DMD_with_asm) 2907 { 2908 asm nothrow @nogc pure @safe 2909 { 2910 rep; nop; // F3 90 = pause 2911 } 2912 } 2913 else version (LDC) 2914 { 2915 // PERF: Do nothing currently , could be the "yield" intruction on ARM. 2916 } 2917 else 2918 static assert(false); 2919 } 2920 unittest 2921 { 2922 _mm_pause(); 2923 } 2924 2925 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 2926 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 2927 /// low 16 bits of 64-bit elements in result. 
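// The two 16-bit sums end up in the low halves of the two 64-bit lanes, i.e. in int lanes 0
// and 2 of the __m128i. Adding them gives the total sum of absolute differences over all
// 16 bytes, a common kernel in motion estimation. Illustrative sketch:
//   __m128i sad   = _mm_sad_epu8(a, b);
//   int     total = sad.array[0] + sad.array[2];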
2928 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted 2929 { 2930 static if (GDC_with_SSE2) 2931 { 2932 return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b); 2933 } 2934 else static if (LDC_with_SSE2) 2935 { 2936 return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b); 2937 } 2938 else static if (LDC_with_ARM64) 2939 { 2940 ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b)); 2941 2942 // PERF: Looks suboptimal vs addp 2943 ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]); 2944 ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]); 2945 ushort8 r = 0; 2946 r[0] = r0; 2947 r[4] = r4; 2948 return cast(__m128i) r; 2949 } 2950 else 2951 { 2952 // PERF: ARM32 is lacking 2953 byte16 ab = cast(byte16)a; 2954 byte16 bb = cast(byte16)b; 2955 ubyte[16] t; 2956 foreach(i; 0..16) 2957 { 2958 int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]); 2959 if (diff < 0) diff = -diff; 2960 t[i] = cast(ubyte)(diff); 2961 } 2962 int4 r = _mm_setzero_si128(); 2963 r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 2964 r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; 2965 return r; 2966 } 2967 } 2968 unittest 2969 { 2970 __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 2971 __m128i B = _mm_set1_epi8(1); 2972 __m128i R = _mm_sad_epu8(A, B); 2973 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 2974 0, 2975 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 2976 0]; 2977 assert(R.array == correct); 2978 } 2979 2980 /// Set packed 16-bit integers with the supplied values. 2981 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 2982 { 2983 short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7]; 2984 return cast(__m128i) loadUnaligned!(short8)(result.ptr); 2985 } 2986 unittest 2987 { 2988 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 2989 short8 B = cast(short8) A; 2990 foreach(i; 0..8) 2991 assert(B.array[i] == i); 2992 } 2993 2994 /// Set packed 32-bit integers with the supplied values. 2995 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 2996 { 2997 pragma(inline, true); 2998 int[4] result = [e0, e1, e2, e3]; 2999 return loadUnaligned!(int4)(result.ptr); 3000 } 3001 unittest 3002 { 3003 __m128i A = _mm_set_epi32(3, 2, 1, 0); 3004 foreach(i; 0..4) 3005 assert(A.array[i] == i); 3006 } 3007 3008 /// Set packed 64-bit integers with the supplied values. 3009 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted 3010 { 3011 pragma(inline, true); 3012 long[2] result = [e0.array[0], e1.array[0]]; 3013 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 3014 } 3015 unittest 3016 { 3017 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); 3018 long2 B = cast(long2) A; 3019 assert(B.array[0] == 5678); 3020 assert(B.array[1] == 1234); 3021 } 3022 3023 /// Set packed 64-bit integers with the supplied values. 3024 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted 3025 { 3026 pragma(inline, true); 3027 long[2] result = [e0, e1]; 3028 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 3029 } 3030 unittest 3031 { 3032 __m128i A = _mm_set_epi64x(1234, 5678); 3033 long2 B = cast(long2) A; 3034 assert(B.array[0] == 5678); 3035 assert(B.array[1] == 1234); 3036 } 3037 3038 /// Set packed 8-bit integers with the supplied values. 
3039 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12, 3040 byte e11, byte e10, byte e9, byte e8, 3041 byte e7, byte e6, byte e5, byte e4, 3042 byte e3, byte e2, byte e1, byte e0) pure @trusted 3043 { 3044 byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7, 3045 e8, e9, e10, e11, e12, e13, e14, e15]; 3046 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 3047 } 3048 3049 /// Set packed double-precision (64-bit) floating-point elements with the supplied values. 3050 __m128d _mm_set_pd (double e1, double e0) pure @trusted 3051 { 3052 pragma(inline, true); 3053 double[2] result = [e0, e1]; 3054 return loadUnaligned!(double2)(result.ptr); 3055 } 3056 unittest 3057 { 3058 __m128d A = _mm_set_pd(61.0, 55.0); 3059 double[2] correct = [55.0, 61.0]; 3060 assert(A.array == correct); 3061 } 3062 3063 /// Broadcast double-precision (64-bit) floating-point value `a` to all element. 3064 __m128d _mm_set_pd1 (double a) pure @trusted 3065 { 3066 pragma(inline, true); 3067 double[2] result = [a, a]; 3068 return loadUnaligned!(double2)(result.ptr); 3069 } 3070 unittest 3071 { 3072 __m128d A = _mm_set_pd1(61.0); 3073 double[2] correct = [61.0, 61.0]; 3074 assert(A.array == correct); 3075 } 3076 3077 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 3078 /// and zero the upper element. 3079 __m128d _mm_set_sd (double a) pure @trusted 3080 { 3081 double[2] result = [a, 0]; 3082 return loadUnaligned!(double2)(result.ptr); 3083 } 3084 3085 /// Broadcast 16-bit integer a to all elements of dst. 3086 __m128i _mm_set1_epi16 (short a) pure @trusted 3087 { 3088 version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 3089 { 3090 short8 v = a; 3091 return cast(__m128i) v; 3092 } 3093 else 3094 { 3095 pragma(inline, true); 3096 return cast(__m128i)(short8(a)); 3097 } 3098 } 3099 unittest 3100 { 3101 short8 a = cast(short8) _mm_set1_epi16(31); 3102 for (int i = 0; i < 8; ++i) 3103 assert(a.array[i] == 31); 3104 } 3105 3106 /// Broadcast 32-bit integer `a` to all elements. 3107 __m128i _mm_set1_epi32 (int a) pure @trusted 3108 { 3109 pragma(inline, true); 3110 return cast(__m128i)(int4(a)); 3111 } 3112 unittest 3113 { 3114 int4 a = cast(int4) _mm_set1_epi32(31); 3115 for (int i = 0; i < 4; ++i) 3116 assert(a.array[i] == 31); 3117 } 3118 3119 /// Broadcast 64-bit integer `a` to all elements. 3120 __m128i _mm_set1_epi64 (__m64 a) pure @safe 3121 { 3122 return _mm_set_epi64(a, a); 3123 } 3124 unittest 3125 { 3126 long b = 0x1DEADCAFE; 3127 __m64 a; 3128 a.ptr[0] = b; 3129 long2 c = cast(long2) _mm_set1_epi64(a); 3130 assert(c.array[0] == b); 3131 assert(c.array[1] == b); 3132 } 3133 3134 /// Broadcast 64-bit integer `a` to all elements 3135 __m128i _mm_set1_epi64x (long a) pure @trusted 3136 { 3137 long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3138 return cast(__m128i)(b); 3139 } 3140 unittest 3141 { 3142 long b = 0x1DEADCAFE; 3143 long2 c = cast(long2) _mm_set1_epi64x(b); 3144 for (int i = 0; i < 2; ++i) 3145 assert(c.array[i] == b); 3146 } 3147 3148 /// Broadcast 8-bit integer `a` to all elements. 
3149 __m128i _mm_set1_epi8 (byte a) pure @trusted 3150 { 3151 pragma(inline, true); 3152 byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3153 return cast(__m128i)(b); 3154 } 3155 unittest 3156 { 3157 byte16 b = cast(byte16) _mm_set1_epi8(31); 3158 for (int i = 0; i < 16; ++i) 3159 assert(b.array[i] == 31); 3160 } 3161 3162 alias _mm_set1_pd = _mm_set_pd1; 3163 3164 /// Set packed 16-bit integers with the supplied values in reverse order. 3165 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 3166 short e3, short e2, short e1, short e0) pure @trusted 3167 { 3168 short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0]; 3169 return cast(__m128i)( loadUnaligned!(short8)(result.ptr) ); 3170 } 3171 unittest 3172 { 3173 short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0); 3174 short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0]; 3175 assert(A.array == correct); 3176 } 3177 3178 /// Set packed 32-bit integers with the supplied values in reverse order. 3179 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3180 { 3181 pragma(inline, true); 3182 int[4] result = [e3, e2, e1, e0]; 3183 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 3184 } 3185 unittest 3186 { 3187 int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647); 3188 int[4] correct = [-1, 0, -2147483648, 2147483647]; 3189 assert(A.array == correct); 3190 } 3191 3192 /// Set packed 64-bit integers with the supplied values in reverse order. 3193 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted 3194 { 3195 long[2] result = [e1, e0]; 3196 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 3197 } 3198 unittest 3199 { 3200 long2 A = cast(long2) _mm_setr_epi64(-1, 0); 3201 long[2] correct = [-1, 0]; 3202 assert(A.array == correct); 3203 } 3204 3205 /// Set packed 8-bit integers with the supplied values in reverse order. 3206 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12, 3207 byte e11, byte e10, byte e9, byte e8, 3208 byte e7, byte e6, byte e5, byte e4, 3209 byte e3, byte e2, byte e1, byte e0) pure @trusted 3210 { 3211 byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, 3212 e7, e6, e5, e4, e3, e2, e1, e0]; 3213 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 3214 } 3215 3216 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order. 3217 __m128d _mm_setr_pd (double e1, double e0) pure @trusted 3218 { 3219 pragma(inline, true); 3220 double2 result; 3221 result.ptr[0] = e1; 3222 result.ptr[1] = e0; 3223 return result; 3224 } 3225 unittest 3226 { 3227 __m128d A = _mm_setr_pd(61.0, 55.0); 3228 double[2] correct = [61.0, 55.0]; 3229 assert(A.array == correct); 3230 } 3231 3232 /// Return vector of type `__m128d` with all elements set to zero. 3233 __m128d _mm_setzero_pd () pure @trusted 3234 { 3235 pragma(inline, true); 3236 // Note: using loadUnaligned has better -O0 codegen compared to .ptr 3237 double[2] result = [0.0, 0.0]; 3238 return loadUnaligned!(double2)(result.ptr); 3239 } 3240 3241 /// Return vector of type `__m128i` with all elements set to zero. 3242 __m128i _mm_setzero_si128() pure @trusted 3243 { 3244 pragma(inline, true); 3245 // Note: using loadUnaligned has better -O0 codegen compared to .ptr 3246 int[4] result = [0, 0, 0, 0]; 3247 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 3248 } 3249 3250 /// Shuffle 32-bit integers in a using the control in `imm8`. 3251 /// See_also: `_MM_SHUFFLE`. 
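// _MM_SHUFFLE(d, c, b, a) selects source lane `a` for result lane 0, `b` for lane 1, `c` for
// lane 2 and `d` for lane 3. Illustrative sketch: broadcasting lane 0 to all four lanes.
//   __m128i splat0 = _mm_shuffle_epi32!(_MM_SHUFFLE(0, 0, 0, 0))(v);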
3252 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe 3253 { 3254 static if (GDC_with_SSE2) 3255 { 3256 return __builtin_ia32_pshufd(a, imm8); 3257 } 3258 else 3259 { 3260 return shufflevector!(int4, (imm8 >> 0) & 3, 3261 (imm8 >> 2) & 3, 3262 (imm8 >> 4) & 3, 3263 (imm8 >> 6) & 3)(a, a); 3264 } 3265 } 3266 unittest 3267 { 3268 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 3269 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3270 int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A); 3271 int[4] expectedB = [ 3, 2, 1, 0 ]; 3272 assert(B.array == expectedB); 3273 } 3274 3275 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`. 3276 /// See_also: `_MM_SHUFFLE2`. 3277 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe 3278 { 3279 static if (GDC_with_SSE2) 3280 { 3281 return __builtin_ia32_shufpd(a, b, imm8); 3282 } 3283 else 3284 { 3285 return shufflevector!(double2, 0 + ( imm8 & 1 ), 3286 2 + ( (imm8 >> 1) & 1 ))(a, b); 3287 } 3288 } 3289 unittest 3290 { 3291 __m128d A = _mm_setr_pd(0.5, 2.0); 3292 __m128d B = _mm_setr_pd(4.0, 5.0); 3293 enum int SHUFFLE = _MM_SHUFFLE2(1, 1); 3294 __m128d R = _mm_shuffle_pd!SHUFFLE(A, B); 3295 double[2] correct = [ 2.0, 5.0 ]; 3296 assert(R.array == correct); 3297 } 3298 3299 /// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 3300 /// 64 bits of result, with the low 64 bits being copied from from `a` to result. 3301 /// See also: `_MM_SHUFFLE`. 3302 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe 3303 { 3304 static if (GDC_with_SSE2) 3305 { 3306 return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8); 3307 } 3308 else 3309 { 3310 return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3, 3311 4 + ( (imm8 >> 0) & 3 ), 3312 4 + ( (imm8 >> 2) & 3 ), 3313 4 + ( (imm8 >> 4) & 3 ), 3314 4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a); 3315 } 3316 } 3317 unittest 3318 { 3319 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3320 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3321 short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A); 3322 short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ]; 3323 assert(C.array == expectedC); 3324 } 3325 3326 /// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 3327 /// bits of result, with the high 64 bits being copied from from `a` to result. 3328 /// See_also: `_MM_SHUFFLE`. 3329 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe 3330 { 3331 static if (GDC_with_SSE2) 3332 { 3333 return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8); 3334 } 3335 else 3336 { 3337 return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ), 3338 ( (imm8 >> 2) & 3 ), 3339 ( (imm8 >> 4) & 3 ), 3340 ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a); 3341 } 3342 } 3343 unittest 3344 { 3345 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3346 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3347 short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A); 3348 short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ]; 3349 assert(B.array == expectedB); 3350 } 3351 3352 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros. 
3353 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted 3354 { 3355 static if (LDC_with_SSE2) 3356 { 3357 return __builtin_ia32_pslld128(a, count); 3358 } 3359 else static if (GDC_with_SSE2) 3360 { 3361 return __builtin_ia32_pslld128(a, count); 3362 } 3363 else static if (DMD_with_32bit_asm) 3364 { 3365 asm pure nothrow @nogc @trusted 3366 { 3367 movdqu XMM0, a; 3368 movdqu XMM1, count; 3369 pslld XMM0, XMM1; 3370 movdqu a, XMM0; 3371 } 3372 return a; 3373 } 3374 else 3375 { 3376 int4 r = void; 3377 long2 lc = cast(long2)count; 3378 int bits = cast(int)(lc.array[0]); 3379 foreach(i; 0..4) 3380 r[i] = cast(uint)(a[i]) << bits; 3381 return r; 3382 } 3383 } 3384 3385 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros. 3386 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted 3387 { 3388 static if (LDC_with_SSE2) 3389 { 3390 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); 3391 } 3392 else static if (GDC_with_SSE2) 3393 { 3394 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); 3395 } 3396 else static if (DMD_with_32bit_asm) 3397 { 3398 asm pure nothrow @nogc @trusted 3399 { 3400 movdqu XMM0, a; 3401 movdqu XMM1, count; 3402 psllq XMM0, XMM1; 3403 movdqu a, XMM0; 3404 } 3405 return a; 3406 } 3407 else 3408 { 3409 // ARM: good since LDC 1.12 -O2 3410 // ~but -O0 version is catastrophic 3411 long2 r = void; 3412 long2 sa = cast(long2)a; 3413 long2 lc = cast(long2)count; 3414 int bits = cast(int)(lc.array[0]); 3415 foreach(i; 0..2) 3416 r.array[i] = cast(ulong)(sa.array[i]) << bits; 3417 return cast(__m128i)r; 3418 } 3419 } 3420 3421 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros. 3422 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted 3423 { 3424 static if (LDC_with_SSE2) 3425 { 3426 return cast(__m128i) _mm_sll_epi16(cast(short8)a, count); 3427 } 3428 else static if (GDC_with_SSE2) 3429 { 3430 return cast(__m128i) _mm_sll_epi16(cast(short8)a, count); 3431 } 3432 else static if (DMD_with_32bit_asm) 3433 { 3434 asm pure nothrow @nogc 3435 { 3436 movdqu XMM0, a; 3437 movdqu XMM1, count; 3438 psllw XMM0, XMM1; 3439 movdqu a, XMM0; 3440 } 3441 return a; 3442 } 3443 else 3444 { 3445 short8 sa = cast(short8)a; 3446 long2 lc = cast(long2)count; 3447 int bits = cast(int)(lc.array[0]); 3448 short8 r = void; 3449 foreach(i; 0..8) 3450 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits); 3451 return cast(int4)r; 3452 } 3453 } 3454 3455 3456 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. 3457 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted 3458 { 3459 static if (GDC_with_SSE2) 3460 { 3461 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); 3462 } 3463 else static if (LDC_with_SSE2) 3464 { 3465 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); 3466 } 3467 else 3468 { 3469 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3470 // D says "It's illegal to shift by the same or more bits 3471 // than the size of the quantity being shifted" 3472 // and it's UB instead. 
3473 int4 r = _mm_setzero_si128(); 3474 3475 ubyte count = cast(ubyte) imm8; 3476 if (count > 31) 3477 return r; 3478 3479 foreach(i; 0..4) 3480 r.array[i] = cast(uint)(a.array[i]) << count; 3481 return r; 3482 } 3483 } 3484 unittest 3485 { 3486 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3487 __m128i B = _mm_slli_epi32(A, 1); 3488 __m128i B2 = _mm_slli_epi32(A, 1 + 256); 3489 int[4] expectedB = [ 0, 4, 6, -8]; 3490 assert(B.array == expectedB); 3491 assert(B2.array == expectedB); 3492 3493 __m128i C = _mm_slli_epi32(A, 0); 3494 int[4] expectedC = [ 0, 2, 3, -4]; 3495 assert(C.array == expectedC); 3496 3497 __m128i D = _mm_slli_epi32(A, 65); 3498 int[4] expectedD = [ 0, 0, 0, 0]; 3499 assert(D.array == expectedD); 3500 } 3501 3502 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 3503 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted 3504 { 3505 static if (GDC_with_SSE2) 3506 { 3507 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3508 } 3509 else static if (LDC_with_SSE2) 3510 { 3511 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3512 } 3513 else 3514 { 3515 long2 sa = cast(long2)a; 3516 3517 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3518 // D says "It's illegal to shift by the same or more bits 3519 // than the size of the quantity being shifted" 3520 // and it's UB instead. 3521 long2 r = cast(long2) _mm_setzero_si128(); 3522 ubyte count = cast(ubyte) imm8; 3523 if (count > 63) 3524 return cast(__m128i)r; 3525 3526 r.ptr[0] = cast(ulong)(sa.array[0]) << count; 3527 r.ptr[1] = cast(ulong)(sa.array[1]) << count; 3528 return cast(__m128i)r; 3529 } 3530 } 3531 unittest 3532 { 3533 __m128i A = _mm_setr_epi64(8, -4); 3534 long2 B = cast(long2) _mm_slli_epi64(A, 1); 3535 long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024); 3536 long[2] expectedB = [ 16, -8]; 3537 assert(B.array == expectedB); 3538 assert(B2.array == expectedB); 3539 3540 long2 C = cast(long2) _mm_slli_epi64(A, 0); 3541 long[2] expectedC = [ 8, -4]; 3542 assert(C.array == expectedC); 3543 3544 long2 D = cast(long2) _mm_slli_epi64(A, 64); 3545 long[2] expectedD = [ 0, -0]; 3546 assert(D.array == expectedD); 3547 } 3548 3549 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 
3550 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted 3551 { 3552 static if (GDC_with_SSE2) 3553 { 3554 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3555 } 3556 else static if (LDC_with_SSE2) 3557 { 3558 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3559 } 3560 else static if (LDC_with_ARM64) 3561 { 3562 short8 sa = cast(short8)a; 3563 short8 r = cast(short8)_mm_setzero_si128(); 3564 ubyte count = cast(ubyte) imm8; 3565 if (count > 15) 3566 return cast(__m128i)r; 3567 r = sa << short8(count); 3568 return cast(__m128i)r; 3569 } 3570 else 3571 { 3572 short8 sa = cast(short8)a; 3573 short8 r = cast(short8)_mm_setzero_si128(); 3574 ubyte count = cast(ubyte) imm8; 3575 if (count > 15) 3576 return cast(__m128i)r; 3577 foreach(i; 0..8) 3578 r.ptr[i] = cast(short)(sa.array[i] << count); 3579 return cast(__m128i)r; 3580 } 3581 } 3582 unittest 3583 { 3584 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3585 short8 B = cast(short8)( _mm_slli_epi16(A, 1) ); 3586 short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) ); 3587 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 3588 assert(B.array == expectedB); 3589 assert(B2.array == expectedB); 3590 3591 short8 C = cast(short8)( _mm_slli_epi16(A, 16) ); 3592 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 3593 assert(C.array == expectedC); 3594 } 3595 3596 3597 /// Shift `a` left by `bytes` bytes while shifting in zeros. 3598 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted 3599 { 3600 static if (bytes & 0xF0) 3601 { 3602 return _mm_setzero_si128(); 3603 } 3604 else 3605 { 3606 static if (GDC_with_SSE2) 3607 { 3608 return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 3609 } 3610 else version(DigitalMars) 3611 { 3612 version(D_InlineAsm_X86) 3613 { 3614 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64 3615 { 3616 movdqu XMM0, op; 3617 pslldq XMM0, bytes; 3618 movdqu op, XMM0; 3619 } 3620 return op; 3621 } 3622 else 3623 { 3624 byte16 A = cast(byte16)op; 3625 byte16 R; 3626 for (int n = 15; n >= bytes; --n) 3627 R.ptr[n] = A.array[n-bytes]; 3628 for (int n = bytes-1; n >= 0; --n) 3629 R.ptr[n] = 0; 3630 return cast(__m128i)R; 3631 } 3632 } 3633 else 3634 { 3635 return cast(__m128i) shufflevector!(byte16, 3636 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes, 3637 22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes, 3638 28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes) 3639 (cast(byte16)_mm_setzero_si128(), cast(byte16)op); 3640 } 3641 } 3642 } 3643 unittest 3644 { 3645 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3646 short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left 3647 short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ]; 3648 assert(R.array == correct); 3649 3650 __m128i B = _mm_srli_si128!16(_mm_set1_epi32(-1)); 3651 int[4] expectedB = [0, 0, 0, 0]; 3652 assert(B.array == expectedB); 3653 } 3654 3655 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`. 
3656 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted 3657 { 3658 version(LDC) 3659 { 3660 // Disappeared with LDC 1.11 3661 static if (__VERSION__ < 2081) 3662 return __builtin_ia32_sqrtpd(vec); 3663 else 3664 { 3665 vec.array[0] = llvm_sqrt(vec.array[0]); 3666 vec.array[1] = llvm_sqrt(vec.array[1]); 3667 return vec; 3668 } 3669 } 3670 else static if (GDC_with_SSE2) 3671 { 3672 return __builtin_ia32_sqrtpd(vec); 3673 } 3674 else 3675 { 3676 vec.ptr[0] = sqrt(vec.array[0]); 3677 vec.ptr[1] = sqrt(vec.array[1]); 3678 return vec; 3679 } 3680 } 3681 3682 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 3683 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 3684 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted 3685 { 3686 // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only. 3687 // "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 3688 // The quadword at bits 127:64 of the destination operand remains unchanged." 3689 version(LDC) 3690 { 3691 // Disappeared with LDC 1.11 3692 static if (__VERSION__ < 2081) 3693 { 3694 __m128d c = __builtin_ia32_sqrtsd(b); 3695 a[0] = c[0]; 3696 return a; 3697 } 3698 else 3699 { 3700 a.array[0] = llvm_sqrt(b.array[0]); 3701 return a; 3702 } 3703 } 3704 else static if (GDC_with_SSE2) 3705 { 3706 __m128d c = __builtin_ia32_sqrtsd(b); 3707 a.ptr[0] = c.array[0]; 3708 return a; 3709 } 3710 else 3711 { 3712 a.ptr[0] = sqrt(b.array[0]); 3713 return a; 3714 } 3715 } 3716 unittest 3717 { 3718 __m128d A = _mm_setr_pd(1.0, 3.0); 3719 __m128d B = _mm_setr_pd(4.0, 5.0); 3720 __m128d R = _mm_sqrt_sd(A, B); 3721 double[2] correct = [2.0, 3.0 ]; 3722 assert(R.array == correct); 3723 } 3724 3725 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits. 3726 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted 3727 { 3728 static if (GDC_with_SSE2) 3729 { 3730 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3731 } 3732 else static if (LDC_with_SSE2) 3733 { 3734 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3735 } 3736 else 3737 { 3738 short8 sa = cast(short8)a; 3739 long2 lc = cast(long2)count; 3740 int bits = cast(int)(lc.array[0]); 3741 short8 r = void; 3742 foreach(i; 0..8) 3743 r.ptr[i] = cast(short)(sa.array[i] >> bits); 3744 return cast(int4)r; 3745 } 3746 } 3747 3748 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits. 3749 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted 3750 { 3751 static if (LDC_with_SSE2) 3752 { 3753 return __builtin_ia32_psrad128(a, count); 3754 } 3755 else static if (GDC_with_SSE2) 3756 { 3757 return __builtin_ia32_psrad128(a, count); 3758 } 3759 else 3760 { 3761 int4 r = void; 3762 long2 lc = cast(long2)count; 3763 int bits = cast(int)(lc.array[0]); 3764 r.ptr[0] = (a.array[0] >> bits); 3765 r.ptr[1] = (a.array[1] >> bits); 3766 r.ptr[2] = (a.array[2] >> bits); 3767 r.ptr[3] = (a.array[3] >> bits); 3768 return r; 3769 } 3770 } 3771 3772 3773 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 
3774 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 3775 { 3776 static if (GDC_with_SSE2) 3777 { 3778 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3779 } 3780 else static if (LDC_with_SSE2) 3781 { 3782 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3783 } 3784 else static if (LDC_with_ARM64) 3785 { 3786 short8 sa = cast(short8)a; 3787 ubyte count = cast(ubyte)imm8; 3788 if (count > 15) 3789 count = 15; 3790 short8 r = sa >> short8(count); 3791 return cast(__m128i)r; 3792 } 3793 else 3794 { 3795 short8 sa = cast(short8)a; 3796 short8 r = void; 3797 3798 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3799 // D says "It's illegal to shift by the same or more bits 3800 // than the size of the quantity being shifted" 3801 // and it's UB instead. 3802 ubyte count = cast(ubyte)imm8; 3803 if (count > 15) 3804 count = 15; 3805 foreach(i; 0..8) 3806 r.ptr[i] = cast(short)(sa.array[i] >> count); 3807 return cast(int4)r; 3808 } 3809 } 3810 unittest 3811 { 3812 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3813 short8 B = cast(short8)( _mm_srai_epi16(A, 1) ); 3814 short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) ); 3815 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 3816 assert(B.array == expectedB); 3817 assert(B2.array == expectedB); 3818 3819 short8 C = cast(short8)( _mm_srai_epi16(A, 18) ); 3820 short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ]; 3821 assert(C.array == expectedC); 3822 } 3823 3824 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 3825 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted 3826 { 3827 static if (LDC_with_SSE2) 3828 { 3829 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 3830 } 3831 else static if (GDC_with_SSE2) 3832 { 3833 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 3834 } 3835 else 3836 { 3837 int4 r = void; 3838 3839 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3840 // D says "It's illegal to shift by the same or more bits 3841 // than the size of the quantity being shifted" 3842 // and it's UB instead. 
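        // Clamping the count to 31 matches the intrinsic: an arithmetic right shift by 32
        // or more fills every lane with its sign bit, which is what a shift by 31 produces.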
        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            count = 31;

        r.ptr[0] = (a.array[0] >> count);
        r.ptr[1] = (a.array[1] >> count);
        r.ptr[2] = (a.array[2] >> count);
        r.ptr[3] = (a.array[3] >> count);
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srai_epi32(A, 1);
    __m128i B2 = _mm_srai_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    __m128i C = _mm_srai_epi32(A, 32);
    int[4] expectedC = [ 0, 0, 0, -1];
    assert(C.array == expectedC);

    __m128i D = _mm_srai_epi32(A, 0);
    int[4] expectedD = [ 0, 2, 3, -4];
    assert(D.array == expectedD);
}

/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
        return cast(int4)r;
    }
}

/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_psrld128(a, count);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_psrld128(a, count);
    }
    else
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        r.ptr[0] = cast(uint)(a.array[0]) >> bits;
        r.ptr[1] = cast(uint)(a.array[1]) >> bits;
        r.ptr[2] = cast(uint)(a.array[2]) >> bits;
        r.ptr[3] = cast(uint)(a.array[3]) >> bits;
        return r;
    }
}

/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
    }
    else
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
        r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
        return cast(__m128i)r;
    }
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_ARM64)
    {
        short8 sa = cast(short8)a;
        short8 r = cast(short8) _mm_setzero_si128();

        ubyte count = cast(ubyte)imm8;
        if (count >= 16)
            return cast(__m128i)r;

        r = sa >>> short8(count); // Vector >>> is available with LDC, but not with DMD.
3960 return cast(__m128i)r; 3961 } 3962 else 3963 { 3964 short8 sa = cast(short8)a; 3965 ubyte count = cast(ubyte)imm8; 3966 3967 short8 r = cast(short8) _mm_setzero_si128(); 3968 if (count >= 16) 3969 return cast(__m128i)r; 3970 3971 foreach(i; 0..8) 3972 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count); 3973 return cast(__m128i)r; 3974 } 3975 } 3976 unittest 3977 { 3978 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3979 short8 B = cast(short8)( _mm_srli_epi16(A, 1) ); 3980 short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) ); 3981 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 3982 assert(B.array == expectedB); 3983 assert(B2.array == expectedB); 3984 3985 short8 C = cast(short8)( _mm_srli_epi16(A, 16) ); 3986 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0]; 3987 assert(C.array == expectedC); 3988 3989 short8 D = cast(short8)( _mm_srli_epi16(A, 0) ); 3990 short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ]; 3991 assert(D.array == expectedD); 3992 } 3993 3994 3995 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 3996 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted 3997 { 3998 static if (GDC_with_SSE2) 3999 { 4000 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4001 } 4002 else static if (LDC_with_SSE2) 4003 { 4004 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4005 } 4006 else 4007 { 4008 ubyte count = cast(ubyte) imm8; 4009 4010 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4011 // D says "It's illegal to shift by the same or more bits 4012 // than the size of the quantity being shifted" 4013 // and it's UB instead. 4014 int4 r = _mm_setzero_si128(); 4015 if (count >= 32) 4016 return r; 4017 r.ptr[0] = a.array[0] >>> count; 4018 r.ptr[1] = a.array[1] >>> count; 4019 r.ptr[2] = a.array[2] >>> count; 4020 r.ptr[3] = a.array[3] >>> count; 4021 return r; 4022 } 4023 } 4024 unittest 4025 { 4026 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 4027 __m128i B = _mm_srli_epi32(A, 1); 4028 __m128i B2 = _mm_srli_epi32(A, 1 + 256); 4029 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; 4030 assert(B.array == expectedB); 4031 assert(B2.array == expectedB); 4032 4033 __m128i C = _mm_srli_epi32(A, 255); 4034 int[4] expectedC = [ 0, 0, 0, 0 ]; 4035 assert(C.array == expectedC); 4036 } 4037 4038 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 4039 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted 4040 { 4041 static if (GDC_with_SSE2) 4042 { 4043 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4044 } 4045 else static if (LDC_with_SSE2) 4046 { 4047 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4048 } 4049 else 4050 { 4051 long2 r = cast(long2) _mm_setzero_si128(); 4052 long2 sa = cast(long2)a; 4053 4054 ubyte count = cast(ubyte) imm8; 4055 if (count >= 64) 4056 return cast(__m128i)r; 4057 4058 r.ptr[0] = sa.array[0] >>> count; 4059 r.ptr[1] = sa.array[1] >>> count; 4060 return cast(__m128i)r; 4061 } 4062 } 4063 unittest 4064 { 4065 __m128i A = _mm_setr_epi64(8, -4); 4066 long2 B = cast(long2) _mm_srli_epi64(A, 1); 4067 long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512); 4068 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; 4069 assert(B.array == expectedB); 4070 assert(B2.array == expectedB); 4071 4072 long2 C = cast(long2) _mm_srli_epi64(A, 64); 4073 long[2] expectedC = [ 0, 0 ]; 4074 assert(C.array == expectedC); 4075 } 4076 4077 /// Shift `v` right by `bytes` bytes while shifting in zeros. 
4078 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe 4079 { 4080 static if (bytes & 0xF0) 4081 { 4082 return _mm_setzero_si128(); 4083 } 4084 else static if (GDC_with_SSE2) 4085 { 4086 return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8)); 4087 } 4088 else static if (DMD_with_32bit_asm) 4089 { 4090 asm pure nothrow @nogc @trusted 4091 { 4092 movdqu XMM0, v; 4093 psrldq XMM0, bytes; 4094 movdqu v, XMM0; 4095 } 4096 return v; 4097 } 4098 else 4099 { 4100 return cast(__m128i) shufflevector!(byte16, 4101 bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7, 4102 bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15) 4103 (cast(byte16) v, cast(byte16)_mm_setzero_si128()); 4104 } 4105 } 4106 unittest 4107 { 4108 __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1)); 4109 int[4] correct = [2, 3, 4, 0]; 4110 assert(R.array == correct); 4111 4112 __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1)); 4113 int[4] expectedA = [0, 0, 0, 0]; 4114 assert(A.array == expectedA); 4115 } 4116 4117 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4118 /// #BONUS 4119 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe 4120 { 4121 return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v); 4122 } 4123 unittest 4124 { 4125 __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)); 4126 float[4] correct = [3.0f, 4.0f, 0, 0]; 4127 assert(R.array == correct); 4128 } 4129 4130 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4131 /// #BONUS 4132 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe 4133 { 4134 return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v); 4135 } 4136 4137 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 4138 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 4139 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted 4140 { 4141 pragma(inline, true); 4142 __m128d* aligned = cast(__m128d*)mem_addr; 4143 *aligned = a; 4144 } 4145 4146 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 4147 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 4148 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted 4149 { 4150 __m128d* aligned = cast(__m128d*)mem_addr; 4151 __m128d r; 4152 r.ptr[0] = a.array[0]; 4153 r.ptr[1] = a.array[0]; 4154 *aligned = r; 4155 } 4156 4157 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 4158 /// be aligned on any particular boundary. 4159 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe 4160 { 4161 pragma(inline, true); 4162 *mem_addr = a.array[0]; 4163 } 4164 4165 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 4166 /// general-protection exception may be generated. 4167 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe 4168 { 4169 pragma(inline, true); 4170 *mem_addr = a; 4171 } 4172 4173 alias _mm_store1_pd = _mm_store_pd1; /// 4174 4175 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory. 
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[1];
}

// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
/// Store 64-bit integer from the first element of `a` into memory.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    long* dest = cast(long*)mem_addr;
    long2 la = cast(long2)a;
    *dest = la.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[0];
}

/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be
/// aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    storeUnaligned!double2(a, mem_addr);
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular
/// boundary.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}

/// Store 32-bit integer from the first element of `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted
{
    pragma(inline, true);
    int* dest = cast(int*)mem_addr;
    *dest = a.array[0];
}
unittest
{
    int[2] arr = [-24, 12];
    _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
    assert(arr == [-24, -1]);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
void _mm_stream_pd (double* mem_addr, __m128d a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a;
}

/// Store 128-bits of integer data from `a` into memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    __m128i* dest = cast(__m128i*)mem_addr;
    *dest = a;
}
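// The store intrinsics above currently have no unittest; the following is a minimal,
// illustrative check of the unaligned and scalar stores (not exhaustive coverage).
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    double[2] buf;
    _mm_storeu_pd(buf.ptr, A);
    double[2] correct = [1.0, 2.0];
    assert(buf == correct);

    double hi, lo;
    _mm_storeh_pd(&hi, A); // upper lane
    _mm_storel_pd(&lo, A); // lower lane
    assert(hi == 2.0);
    assert(lo == 1.0);

    int[4] ibuf;
    _mm_storeu_si128(cast(__m128i*) ibuf.ptr, _mm_setr_epi32(1, 2, 3, 4));
    int[4] icorrect = [1, 2, 3, 4];
    assert(ibuf == icorrect);
}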
/// Store 32-bit integer `a` into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address `mem_addr` is already in the cache,
/// the cache will be updated.
void _mm_stream_si32 (int* mem_addr, int a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Store 64-bit integer `a` into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address `mem_addr` is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit)
/// floating-point elements in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a - b;
}

/// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit)
/// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
/// upper element of result.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_subsd(a, b);
    }
    else
    {
        a.ptr[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}

/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    pragma(inline, true);
    return a - b;
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using signed saturation.
__m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PSUBSW since LDC 1.15 -O0
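            // The inline IR below maps directly onto LLVM's @llvm.ssub.sat.v8i16
            // saturating-subtract intrinsic, the subtract counterpart of the
            // @llvm.sadd.sat call used by _mm_adds_epi16.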
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(  -10,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using signed saturation.
__m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBSB since LDC 1.15 -O0
            // ARM: Generates sqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
correctResult); 4462 } 4463 4464 /// Add packed 16-bit unsigned integers in `a` and `b` using unsigned saturation. 4465 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted 4466 { 4467 version(LDC) 4468 { 4469 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 4470 { 4471 // x86: Generates PSUBUSW since LDC 1.15 -O0 4472 // ARM: Generates uqsub.8h since LDC 1.21 -O0 4473 enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 4474 enum ir = ` 4475 %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 4476 ret <8 x i16> %r`; 4477 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 4478 } 4479 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 4480 { 4481 short[8] res; 4482 short8 sa = cast(short8)a; 4483 short8 sb = cast(short8)b; 4484 foreach(i; 0..8) 4485 { 4486 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]); 4487 res[i] = saturateSignedIntToUnsignedShort(sum); 4488 } 4489 return _mm_loadu_si128(cast(int4*)res.ptr); 4490 } 4491 else static if (LDC_with_SSE2) 4492 { 4493 return cast(__m128i) __builtin_ia32_psubusw128(a, b); 4494 } 4495 else 4496 static assert(false); 4497 } 4498 else static if (GDC_with_SSE2) 4499 { 4500 return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b); 4501 } 4502 else 4503 { 4504 short[8] res; 4505 short8 sa = cast(short8)a; 4506 short8 sb = cast(short8)b; 4507 foreach(i; 0..8) 4508 { 4509 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]); 4510 res[i] = saturateSignedIntToUnsignedShort(sum); 4511 } 4512 return _mm_loadu_si128(cast(int4*)res.ptr); 4513 } 4514 } 4515 unittest 4516 { 4517 short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534, 1, 5, 4, 3, 2, 1, 0), 4518 _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0)); 4519 static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0]; 4520 assert(R.array == correct); 4521 } 4522 4523 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 4524 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted 4525 { 4526 version(LDC) 4527 { 4528 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 4529 { 4530 // x86: Generates PSUBUSB since LDC 1.15 -O0 4531 // ARM: Generates uqsub.16b since LDC 1.21 -O0 4532 enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 4533 enum ir = ` 4534 %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 4535 ret <16 x i8> %r`; 4536 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 4537 } 4538 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 4539 { 4540 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return __builtin_ia32_psubusb128(a, b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Note: the only difference between these intrinsics and the _mm_comi* functions they alias
// is the signalling behaviour on quiet NaNs. Aliasing them is therefore not strictly correct,
// but the case where you would want to differentiate between qNaN and sNaN, and then treat
// them differently on purpose, seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd; ///
alias _mm_ucomige_sd = _mm_comige_sd; ///
alias _mm_ucomigt_sd = _mm_comigt_sd; ///
alias _mm_ucomile_sd = _mm_comile_sd; ///
alias _mm_ucomilt_sd = _mm_comilt_sd; ///
alias _mm_ucomineq_sd = _mm_comineq_sd; ///

/// Return vector of type `__m128d` with undefined elements.
__m128d _mm_undefined_pd() pure @safe
{
    pragma(inline, true);
    __m128d result = void;
    return result;
}

/// Return vector of type `__m128i` with undefined elements.
__m128i _mm_undefined_si128() pure @safe
{
    pragma(inline, true);
    __m128i result = void;
    return result;
}

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
4641 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted 4642 { 4643 static if (GDC_with_SSE2) 4644 { 4645 return __builtin_ia32_punpckhdq128(a, b); 4646 } 4647 else version(DigitalMars) 4648 { 4649 __m128i r; 4650 r.ptr[0] = a.array[2]; 4651 r.ptr[1] = b.array[2]; 4652 r.ptr[2] = a.array[3]; 4653 r.ptr[3] = b.array[3]; 4654 return r; 4655 } 4656 else 4657 { 4658 return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b); 4659 } 4660 } 4661 unittest 4662 { 4663 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4664 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 4665 __m128i C = _mm_unpackhi_epi32(A, B); 4666 int[4] correct = [3, 7, 4, 8]; 4667 assert(C.array == correct); 4668 } 4669 4670 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. 4671 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted 4672 { 4673 static if (GDC_with_SSE2) 4674 { 4675 return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b); 4676 } 4677 else 4678 { 4679 __m128i r = cast(__m128i)b; 4680 r[0] = a[2]; 4681 r[1] = a[3]; 4682 return r; 4683 } 4684 } 4685 unittest // Issue #36 4686 { 4687 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4688 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4689 long2 C = cast(long2)(_mm_unpackhi_epi64(A, B)); 4690 long[2] correct = [0x33333333_33333333, 0x55555555_55555555]; 4691 assert(C.array == correct); 4692 } 4693 4694 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. 4695 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe 4696 { 4697 static if (GDC_with_SSE2) 4698 { 4699 return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b); 4700 } 4701 else static if (DMD_with_32bit_asm) 4702 { 4703 asm pure nothrow @nogc @trusted 4704 { 4705 movdqu XMM0, a; 4706 movdqu XMM1, b; 4707 punpckhbw XMM0, XMM1; 4708 movdqu a, XMM0; 4709 } 4710 return a; 4711 } 4712 else 4713 { 4714 return cast(__m128i)shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27, 4715 12, 28, 13, 29, 14, 30, 15, 31) 4716 (cast(byte16)a, cast(byte16)b); 4717 } 4718 } 4719 unittest 4720 { 4721 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 4722 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 4723 byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B); 4724 byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]; 4725 assert(C.array == correct); 4726 } 4727 4728 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`. 4729 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe 4730 { 4731 static if (GDC_with_SSE2) 4732 { 4733 return __builtin_ia32_unpckhpd(a, b); 4734 } 4735 else 4736 { 4737 return shufflevector!(__m128d, 1, 3)(a, b); 4738 } 4739 } 4740 unittest 4741 { 4742 __m128d A = _mm_setr_pd(4.0, 6.0); 4743 __m128d B = _mm_setr_pd(7.0, 9.0); 4744 __m128d C = _mm_unpackhi_pd(A, B); 4745 double[2] correct = [6.0, 9.0]; 4746 assert(C.array == correct); 4747 } 4748 4749 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. 
4750 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe 4751 { 4752 static if (GDC_with_SSE2) 4753 { 4754 return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b); 4755 } 4756 else static if (DMD_with_32bit_asm) 4757 { 4758 asm pure nothrow @nogc @trusted 4759 { 4760 movdqu XMM0, a; 4761 movdqu XMM1, b; 4762 punpcklwd XMM0, XMM1; 4763 movdqu a, XMM0; 4764 } 4765 return a; 4766 } 4767 else 4768 { 4769 return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11) 4770 (cast(short8)a, cast(short8)b); 4771 } 4772 } 4773 unittest 4774 { 4775 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 4776 __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); 4777 short8 C = cast(short8) _mm_unpacklo_epi16(A, B); 4778 short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11]; 4779 assert(C.array == correct); 4780 } 4781 4782 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. 4783 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted 4784 { 4785 static if (GDC_with_SSE2) 4786 { 4787 return __builtin_ia32_punpckldq128(a, b); 4788 } 4789 else version(DigitalMars) 4790 { 4791 __m128i r; 4792 r.ptr[0] = a.array[0]; 4793 r.ptr[1] = b.array[0]; 4794 r.ptr[2] = a.array[1]; 4795 r.ptr[3] = b.array[1]; 4796 return r; 4797 } 4798 else 4799 { 4800 return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b); 4801 } 4802 } 4803 unittest 4804 { 4805 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4806 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 4807 __m128i C = _mm_unpacklo_epi32(A, B); 4808 int[4] correct = [1, 5, 2, 6]; 4809 assert(C.array == correct); 4810 } 4811 4812 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. 4813 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted 4814 { 4815 static if (GDC_with_SSE2) 4816 { 4817 return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b); 4818 } 4819 else 4820 { 4821 long2 lA = cast(long2)a; 4822 long2 lB = cast(long2)b; 4823 long2 R; 4824 R.ptr[0] = lA.array[0]; 4825 R.ptr[1] = lB.array[0]; 4826 return cast(__m128i)R; 4827 } 4828 } 4829 unittest // Issue #36 4830 { 4831 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4832 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4833 long2 C = cast(long2)(_mm_unpacklo_epi64(A, B)); 4834 long[2] correct = [0x22222222_22222222, 0x44444444_44444444]; 4835 assert(C.array == correct); 4836 } 4837 4838 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. 
4839 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe 4840 { 4841 static if (GDC_with_SSE2) 4842 { 4843 return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b); 4844 } 4845 else static if (DMD_with_32bit_asm) 4846 { 4847 asm pure nothrow @nogc @trusted 4848 { 4849 movdqu XMM0, a; 4850 movdqu XMM1, b; 4851 punpcklbw XMM0, XMM1; 4852 movdqu a, XMM0; 4853 } 4854 return a; 4855 } 4856 else 4857 { 4858 return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19, 4859 4, 20, 5, 21, 6, 22, 7, 23) 4860 (cast(byte16)a, cast(byte16)b); 4861 } 4862 } 4863 unittest 4864 { 4865 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 4866 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 4867 byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B); 4868 byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]; 4869 assert(C.array == correct); 4870 } 4871 4872 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`. 4873 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe 4874 { 4875 static if (GDC_with_SSE2) 4876 { 4877 return __builtin_ia32_unpcklpd(a, b); 4878 } 4879 else 4880 { 4881 return shufflevector!(__m128d, 0, 2)(a, b); 4882 } 4883 } 4884 unittest 4885 { 4886 __m128d A = _mm_setr_pd(4.0, 6.0); 4887 __m128d B = _mm_setr_pd(7.0, 9.0); 4888 __m128d C = _mm_unpacklo_pd(A, B); 4889 double[2] correct = [4.0, 7.0]; 4890 assert(C.array == correct); 4891 } 4892 4893 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 4894 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe 4895 { 4896 return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b); 4897 } 4898 // TODO unittest and thus force inline 4899 4900 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`. 4901 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe 4902 { 4903 return a ^ b; 4904 } 4905 // TODO unittest and thus force inline 4906 4907 unittest 4908 { 4909 float distance(float[4] a, float[4] b) nothrow @nogc 4910 { 4911 __m128 va = _mm_loadu_ps(a.ptr); 4912 __m128 vb = _mm_loadu_ps(b.ptr); 4913 __m128 diffSquared = _mm_sub_ps(va, vb); 4914 diffSquared = _mm_mul_ps(diffSquared, diffSquared); 4915 __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared)); 4916 sum = _mm_add_ps(sum, _mm_srli_ps!4(sum)); 4917 return _mm_cvtss_f32(_mm_sqrt_ss(sum)); 4918 } 4919 assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2); 4920 }
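// _mm_xor_si128 and _mm_xor_pd above are still marked "TODO unittest"; the following is a
// minimal sketch of what such a test could check, kept at the end of the module.
unittest
{
    __m128i A = _mm_setr_epi32(0x0F0F0F0F,  0, -1, 0x12345678);
    __m128i B = _mm_setr_epi32(0x55555555, -1, -1, 0x12345678);
    __m128i R = _mm_xor_si128(A, B);
    int[4] correct = [0x5A5A5A5A, -1, 0, 0];
    assert(R.array == correct);

    __m128d X = _mm_setr_pd(4.5, -2.0);
    __m128d Z = _mm_xor_pd(X, X); // XOR with itself clears every bit, giving +0.0 in both lanes
    double[2] zero = [0.0, 0.0];
    assert(Z.array == zero);
}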