/**
* SSE2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1
import inteli.mmx;
import inteli.internals;

nothrow @nogc:


// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(short8)a + cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 R = cast(short8) _mm_add_epi16(A, A);
    short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(int4)a + cast(int4)b);
}
unittest
{
    __m128i A = _mm_setr_epi32(-7, -1, 0, 9);
    int4 R = _mm_add_epi32(A, A);
    int[4] correct = [-14, -2, 0, 18];
    assert(R.array == correct);
}

/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(long2)a + cast(long2)b);
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 R = cast(long2) _mm_add_epi64(A, A);
    long[2] correct = [-2, 0];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(byte16)a + cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 R = cast(byte16) _mm_add_epi8(A, A);
    byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(R.array == correct);
}

/// Add the lower double-precision (64-bit) floating-point element
/// in `a` and `b`, store the result in the lower element of dst,
/// and copy the upper element from `a` to the upper element of destination.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop; }
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}

/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    pragma(inline, true);
    return a + b;
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSW since LDC 1.15 -O0
            // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
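// Minimal illustrative check for _mm_add_si64 above (example operand values chosen arbitrarily).
unittest
{
    __m64 A = to_m64(_mm_setr_epi64(20, 0));
    __m64 B = to_m64(_mm_setr_epi64(30, 0));
    __m64 R = _mm_add_si64(A, B);
    assert(R.array[0] == 50);
}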
/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSB since LDC 1.15 -O0
            // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}

/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSB since LDC 1.15 -O0
            // ARM: Generates uqadd.16b since LDC 1.21 -O1
            enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusb128(a, b);
    }
    else
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16)
        _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                      _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14,
                                               0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSW since LDC 1.15 -O0
            // ARM: Generates uqadd.8h since LDC 1.21 -O1
            enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ushort[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusw128(a, b);
    }
    else
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return cast(__m128d)( cast(long2)a & cast(long2)b );
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m128d A = _mm_set_pd(a, b);
    __m128d B = _mm_set_pd(b, a);
    long2 R = cast(long2)( _mm_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return a & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_and_si128(A, B);
    int[4] correct = [6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of packed double-precision (64-bit)
/// floating-point elements in `a` and then AND with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
    long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
    __m128d A = _mm_setr_pd(a, b);
    __m128d B = _mm_setr_pd(b, a);
    long2 R = cast(long2)( _mm_andnot_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct2);
}

/// Compute the bitwise NOT of 128 bits (representing integer data)
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_andnot_si128(A, B);
    int[4] correct = [8, 8, 8, 8];
    assert(R.array == correct);
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48);
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
    }
    else version(LDC)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48);
}

/// Shift `a` left by `bytes` bytes while shifting in zeros.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Cast vector of type `__m128d` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Cast vector of type `__m128d` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128i` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}
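// Illustrative round-trip through the cast intrinsics above: they only
// reinterpret the 128 bits, so casting back must preserve the values.
unittest
{
    __m128d A = _mm_setr_pd(1.5, -2.0);
    __m128d B = _mm_castsi128_pd(_mm_castpd_si128(A));
    assert(B.array == A.array);
}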
/// Cast vector of type `__m128i` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}

/// Invalidate and flush the cache line that contains `p`
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else
    {
        // Do nothing. Invalidating cacheline does
        // not affect correctness.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4,  3,  2, 1, 0, -1, -2, -3];
    short[8] E = [ 0,  0,  0, 0, -1, 0, 0, 0];
    short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4 A = [-3, -2, -1, 0];
    int4 B = [ 4, -2,  2, 0];
    int[4] E = [ 0, -1,  0, -1];
    int4 R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}
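// Illustrative check for _mm_cmpeq_pd: equal lanes yield an all-ones mask, other lanes zero.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpeq_pd(A, B);
    long[2] correct = [-1, 0];
    assert(R.array == correct);
}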
/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0,  1,  2,  3];
    short8 B = [ 4,  3,  2, 1, 0, -1, -2, -3];
    short[8] E = [ 0,  0,  0, 0, 0, -1, -1, -1];
    short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b));
    }
}
unittest
{
    int4 A = [-3,  2, -1, 0];
    int4 B = [ 4, -2,  2, 0];
    int[4] E = [ 0, -1,  0, 0];
    int4 R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else
    {
        return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct = [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    __m128i D = _mm_cmpeq_epi8(A, B);
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}
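// Illustrative check for _mm_cmpgt_pd; a NaN operand compares false, since this is an ordered comparison.
unittest
{
    __m128d A = _mm_setr_pd(2.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, 1.0);
    long2 R = cast(long2) _mm_cmpgt_pd(A, B);
    long[2] correct = [-1, 0];
    assert(R.array == correct);
}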
/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed 32-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}
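// Illustrative check for _mm_cmpneq_pd above; unlike the ordered comparisons, a NaN operand compares true.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, 2.0);
    long2 R = cast(long2) _mm_cmpneq_pd(A, B);
    long[2] correct = [0, -1];
    assert(R.array == correct);
}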
/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal, store the result in
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}
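// Illustrative check for _mm_cmpnlt_pd above: "not-less-than" is also true for unordered (NaN) operands.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(2.0, 2.0);
    long2 R = cast(long2) _mm_cmpnlt_pd(A, B);
    long[2] correct = [0, -1];
    assert(R.array == correct);
}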
/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN, store the result in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN, store the result in the lower
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    // Note: For some of the _mm_comixx_sx intrinsics, the NaN semantics of the intrinsic are not
    // the same as those of the comisd instruction: the intrinsic returns false in the unordered case instead.
    //
    // Actually C++ compilers disagree over the meaning of that instruction.
    // GCC will handle NaNs like the comisd instruction (return true if unordered),
    // but ICC, clang and MSVC will deal with NaN like the Intel Intrinsics Guide says.
    // We choose to follow the majority. It seems GCC is buggy with NaNs.
    return a.array[0] == b.array[0];
}
unittest
{
    assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}
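// Illustrative NaN-handling checks for _mm_cmpord_pd and _mm_cmpunord_pd defined above:
// a lane is "ordered" exactly when neither operand is NaN.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(2.0, 2.0);
    long2 ord = cast(long2) _mm_cmpord_pd(A, B);
    long2 unord = cast(long2) _mm_cmpunord_pd(A, B);
    long[2] correctOrd = [-1, 0];
    long[2] correctUnord = [0, -1];
    assert(ord.array == correctOrd);
    assert(unord.array == correctUnord);
}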
/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than-or-equal, and return the boolean
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] >= b.array[0];
}
unittest
{
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] > b.array[0];
}
unittest
{
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] <= b.array[0];
}
unittest
{
    assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] < b.array[0];
}
unittest
{
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] != b.array[0];
}
unittest
{
    assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else version(LDC)
    {
        // See #86 for why we had to resort to LLVM IR.
        // Plain code below was leading to catastrophic behaviour.
        // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x float>
            ret <4 x float> %r`;
        return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
    }
    else
    {
        __m128 res;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    // PERF ARM32
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        long2 i;
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
            case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
            case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
        }
        int4 zero = 0;
        return cast(__m128i) shufflevector!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
    }
    else
    {
        // PERF ARM32
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    {
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_cvtps2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    // GDC bug #98607
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide an optimization barrier for the rounding mode.
    // Worked around with different literals. This bug will likely only manifest in unittest.
    // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}
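// Illustrative check for _mm_cvtsd_f64 above (it simply returns the lower element).
unittest
{
    assert(_mm_cvtsd_f64(_mm_setr_pd(4.25, -8.0)) == 4.25);
}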
/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtsd_si64 (__m128d a) @trusted
{
    version (LDC)
    {
        version (X86_64)
        {
            return __builtin_ia32_cvtsd2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
    else
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///

/// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit)
/// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b);
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a.ptr[0] = b.array[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}

/// Get the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la.array[0];
}
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;

/// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 r = [0, 0, 0, 0];
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}
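// Illustrative checks for _mm_cvtsi128_si32 and _mm_cvtsi128_si64 defined above (they read the lowest lane).
unittest
{
    __m128i A = _mm_setr_epi32(-42, 7, 0, 0);
    assert(_mm_cvtsi128_si32(A) == -42);
    __m128i B = _mm_setr_epi64(-1234567890123, 0);
    assert(_mm_cvtsi128_si64(B) == -1234567890123);
}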
/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}

deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///

/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit)
/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper
/// element of result.
double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
/// Put zeroes in the upper elements of result.
__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: generates fcvtze since LDC 1.8 -O2
    __m128i r;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resorts to the FPU
    return cast(long)a.array[0];
}

deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///

/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a / b;
}

/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`,
/// store the result in the lower element, and copy the upper element from `a`.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_divsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop; }
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        a.ptr[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
    assert(_mm_extract_epi16(A, 5 + 8) == 5);
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

/// Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction.
void _mm_lfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_lfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
    else version(LDC)
    {
        llvm_memory_fence(); // PERF actually generates mfence
    }
    else
        static assert(false);
}
unittest
{
    _mm_lfence();
}
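// Illustrative checks for _mm_div_pd, _mm_cvttsd_si32 and _mm_cvttsd_si64 defined above
// (truncation rounds toward zero regardless of the MXCSR rounding mode).
unittest
{
    __m128d A = _mm_setr_pd(9.0, -8.0);
    __m128d B = _mm_setr_pd(3.0, 2.0);
    __m128d R = _mm_div_pd(A, B);
    assert(R.array == [3.0, -4.0]);
    assert(_mm_cvttsd_si32(_mm_set1_pd(-4.9)) == -4);
    assert(_mm_cvttsd_si64(_mm_set1_pd(45641.7)) == 45641);
}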
/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    pragma(inline, true);
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}
unittest
{
    align(16) double[2] S = [-5.0, 7.0];
    __m128d R = _mm_load_pd(S.ptr);
    assert(R.array == S);
}

/// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double m = *mem_addr;
    __m128d r;
    r.ptr[0] = m;
    r.ptr[1] = m;
    return r;
}
unittest
{
    double what = 4;
    __m128d R = _mm_load_pd1(&what);
    double[2] correct = [4.0, 4];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper
/// element. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Load 128-bits of integer data from memory into dst.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be trusted because of alignment, Issue #62
{
    pragma(inline, true);
    return *mem_addr;
}
unittest
{
    align(16) int[4] correct = [-1, 2, 3, 4];
    int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
    assert(A.array == correct);
}

alias _mm_load1_pd = _mm_load_pd1; ///

/// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the
/// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    pragma(inline, true);
    a.ptr[1] = *mem_addr;
    return a;
}
unittest
{
    double A = 7.0;
    __m128d B = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadh_pd(B, &A);
    double[2] correct = [ 4.0, 7.0 ];
    assert(R.array == correct);
}

/// Load 64-bit integer from memory into the first element of result. Zero out the other.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60)
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
{
    pragma(inline, true);
    auto pLong = cast(const(long)*)mem_addr;
    long2 r = [0, 0];
    r.ptr[0] = *pLong;
    return cast(__m128i)(r);
}
unittest
{
    long A = 0x7878787870707070;
    long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
    long[2] correct = [0x7878787870707070, 0];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the
/// upper element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
1908 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted 1909 { 1910 a.ptr[0] = *mem_addr; 1911 return a; 1912 } 1913 unittest 1914 { 1915 double A = 7.0; 1916 __m128d B = _mm_setr_pd(4.0, -5.0); 1917 __m128d R = _mm_loadl_pd(B, &A); 1918 double[2] correct = [ 7.0, -5.0 ]; 1919 assert(R.array == correct); 1920 } 1921 1922 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 1923 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1924 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted 1925 { 1926 __m128d a = *cast(__m128d*)(mem_addr); 1927 __m128d r; 1928 r.ptr[0] = a.array[1]; 1929 r.ptr[1] = a.array[0]; 1930 return r; 1931 } 1932 unittest 1933 { 1934 align(16) double[2] A = [56.0, -74.0]; 1935 __m128d R = _mm_loadr_pd(A.ptr); 1936 double[2] correct = [-74.0, 56.0]; 1937 assert(R.array == correct); 1938 } 1939 1940 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 1941 /// `mem_addr` does not need to be aligned on any particular boundary. 1942 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted 1943 { 1944 pragma(inline, true); 1945 static if (GDC_with_SSE2) 1946 { 1947 return __builtin_ia32_loadupd(mem_addr); 1948 } 1949 else version(LDC) 1950 { 1951 return loadUnaligned!(double2)(mem_addr); 1952 } 1953 else version(DigitalMars) 1954 { 1955 // Apparently inside __simd you can use aligned dereferences without fear. 1956 // That was issue 23048 on dlang's Bugzilla. 1957 static if (DMD_with_DSIMD) 1958 { 1959 return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr); 1960 } 1961 else static if (SSESizedVectorsAreEmulated) 1962 { 1963 // Since this vector is emulated, it doesn't have alignement constraints 1964 // and as such we can just cast it. 1965 return *cast(__m128d*)(mem_addr); 1966 } 1967 else 1968 { 1969 __m128d result; 1970 result.ptr[0] = mem_addr[0]; 1971 result.ptr[1] = mem_addr[1]; 1972 return result; 1973 } 1974 } 1975 else 1976 { 1977 __m128d result; 1978 result.ptr[0] = mem_addr[0]; 1979 result.ptr[1] = mem_addr[1]; 1980 return result; 1981 } 1982 } 1983 unittest 1984 { 1985 double[2] A = [56.0, -75.0]; 1986 __m128d R = _mm_loadu_pd(A.ptr); 1987 double[2] correct = [56.0, -75.0]; 1988 assert(R.array == correct); 1989 } 1990 1991 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary. 1992 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted 1993 { 1994 pragma(inline, true); 1995 static if (GDC_with_SSE2) 1996 { 1997 return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr); 1998 } 1999 else 2000 { 2001 return loadUnaligned!(__m128i)(cast(int*)mem_addr); 2002 } 2003 } 2004 unittest 2005 { 2006 align(16) int[4] correct = [-1, 2, -3, 4]; 2007 int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr); 2008 assert(A.array == correct); 2009 } 2010 2011 /// Load unaligned 32-bit integer from memory into the first element of result. 
2012 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted 2013 { 2014 pragma(inline, true); 2015 int r = *cast(int*)(mem_addr); 2016 int4 result = [0, 0, 0, 0]; 2017 result.ptr[0] = r; 2018 return result; 2019 } 2020 unittest 2021 { 2022 int r = 42; 2023 __m128i A = _mm_loadu_si32(&r); 2024 int[4] correct = [42, 0, 0, 0]; 2025 assert(A.array == correct); 2026 } 2027 2028 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 2029 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 2030 /// and pack the results in destination. 2031 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted 2032 { 2033 static if (GDC_with_SSE2) 2034 { 2035 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2036 } 2037 else static if (LDC_with_SSE2) 2038 { 2039 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2040 } 2041 else static if (LDC_with_ARM64) 2042 { 2043 int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b)); 2044 int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b)); 2045 int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); 2046 int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); 2047 return vcombine_s32(rl, rh); 2048 } 2049 else 2050 { 2051 short8 sa = cast(short8)a; 2052 short8 sb = cast(short8)b; 2053 int4 r; 2054 foreach(i; 0..4) 2055 { 2056 r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1]; 2057 } 2058 return r; 2059 } 2060 } 2061 unittest 2062 { 2063 short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2064 short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2065 int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B); 2066 int[4] correct = [1, 13, -2147483648, 2*32767*32767]; 2067 assert(R.array == correct); 2068 } 2069 2070 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 2071 /// (elements are not stored when the highest bit is not set in the corresponding element) 2072 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 2073 /// boundary. 
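// Note on the fallbacks below: only the GDC/LDC SSE2 paths emit `maskmovdqu` and keep its
// non-temporal store hint; the ARM64 path does a masked read-modify-write and the scalar path
// does plain per-byte stores, so the hint is silently dropped there.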
2074 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted 2075 { 2076 static if (GDC_with_SSE2) 2077 { 2078 return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr); 2079 } 2080 else static if (LDC_with_SSE2) 2081 { 2082 return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr); 2083 } 2084 else static if (LDC_with_ARM64) 2085 { 2086 // PERF: catastrophic on ARM32 2087 byte16 bmask = cast(byte16)mask; 2088 byte16 shift = 7; 2089 bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask 2090 mask = cast(__m128i) bmask; 2091 __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr); 2092 dest = (a & mask) | (dest & ~mask); 2093 storeUnaligned!__m128i(dest, cast(int*)mem_addr); 2094 } 2095 else 2096 { 2097 byte16 b = cast(byte16)a; 2098 byte16 m = cast(byte16)mask; 2099 byte* dest = cast(byte*)(mem_addr); 2100 foreach(j; 0..16) 2101 { 2102 if (m.array[j] & 128) 2103 { 2104 dest[j] = b.array[j]; 2105 } 2106 } 2107 } 2108 } 2109 unittest 2110 { 2111 ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]; 2112 __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0); 2113 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15); 2114 _mm_maskmoveu_si128(A, mask, dest.ptr); 2115 ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42]; 2116 assert(dest == correct); 2117 } 2118 2119 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values. 2120 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe 2121 { 2122 static if (GDC_with_SSE2) 2123 { 2124 return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b); 2125 } 2126 else version(LDC) 2127 { 2128 // x86: pmaxsw since LDC 1.0 -O1 2129 // ARM: smax.8h since LDC 1.5 -01 2130 short8 sa = cast(short8)a; 2131 short8 sb = cast(short8)b; 2132 short8 greater = greaterMask!short8(sa, sb); 2133 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2134 } 2135 else 2136 { 2137 __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else 2138 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2139 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2140 return _mm_xor_si128(b, mask); 2141 } 2142 } 2143 unittest 2144 { 2145 short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57), 2146 _mm_setr_epi16(-4,-8, 9, 7, 0,-32768, 0, 0)); 2147 short[8] correct = [32767, 1, 9, 7, 9, 7, 0, 0]; 2148 assert(R.array == correct); 2149 } 2150 2151 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values. 
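// The non-LDC fallback below has no unsigned byte comparison to work with, so it biases both
// inputs by -128 (flipping the sign bit); after that the signed `_mm_cmpgt_epi8` orders the
// values exactly like an unsigned compare, and the xor/and/xor blend picks the larger byte,
// the same trick `_mm_max_epi16` uses.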
2152 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe 2153 { 2154 version(LDC) 2155 { 2156 // x86: pmaxub since LDC 1.0.0 -O1 2157 // ARM64: umax.16b since LDC 1.5.0 -O1 2158 // PERF: catastrophic on ARM32 2159 ubyte16 sa = cast(ubyte16)a; 2160 ubyte16 sb = cast(ubyte16)b; 2161 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2162 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2163 } 2164 else 2165 { 2166 __m128i value128 = _mm_set1_epi8(-128); 2167 __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2168 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2169 __m128i mask = _mm_and_si128(aTob, higher); 2170 return _mm_xor_si128(b, mask); 2171 } 2172 } 2173 unittest 2174 { 2175 byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2176 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2177 byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 2178 assert(R.array == correct); 2179 } 2180 2181 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values. 2182 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted 2183 { 2184 static if (GDC_with_SSE2) 2185 { 2186 return __builtin_ia32_maxpd(a, b); 2187 } 2188 else 2189 { 2190 // x86: Generates maxpd starting with LDC 1.9 -O2 2191 a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; 2192 a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1]; 2193 return a; 2194 } 2195 } 2196 unittest 2197 { 2198 __m128d A = _mm_setr_pd(4.0, 1.0); 2199 __m128d B = _mm_setr_pd(1.0, 8.0); 2200 __m128d M = _mm_max_pd(A, B); 2201 assert(M.array[0] == 4.0); 2202 assert(M.array[1] == 8.0); 2203 } 2204 2205 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 2206 /// lower element of result, and copy the upper element from `a` to the upper element of result. 2207 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted 2208 { 2209 static if (GDC_with_SSE2) 2210 { 2211 return __builtin_ia32_maxsd(a, b); 2212 } 2213 else 2214 { 2215 __m128d r = a; 2216 // Generates maxsd starting with LDC 1.3 2217 r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; 2218 return r; 2219 } 2220 } 2221 unittest 2222 { 2223 __m128d A = _mm_setr_pd(1.0, 1.0); 2224 __m128d B = _mm_setr_pd(4.0, 2.0); 2225 __m128d M = _mm_max_sd(A, B); 2226 assert(M.array[0] == 4.0); 2227 assert(M.array[1] == 1.0); 2228 } 2229 2230 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 2231 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 2232 /// is globally visible before any memory instruction which follows the fence in program order. 
void _mm_mfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_mfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
    else version(LDC)
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_mfence();
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 greater = greaterMask!short8(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lowerShorts);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-32768),
                                          _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0));
    short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -32768];
    assert(R.array == correct);
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    version(LDC)
    {
        // x86: pminub since LDC 1.0.0 -O1
        // ARM: umin.16b since LDC 1.5.0 -O1
        // PERF: catastrophic on ARM32
        ubyte16 sa = cast(ubyte16)a;
        ubyte16 sb = cast(ubyte16)b;
        ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i value128 = _mm_set1_epi8(-128);
        __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
                                         _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
    byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0];
    assert(R.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minpd(a, b);
    }
    else
    {
        // Generates minpd starting with LDC 1.9
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ?
a.array[1] : b.array[1]; 2352 return a; 2353 } 2354 } 2355 unittest 2356 { 2357 __m128d A = _mm_setr_pd(1.0, 2.0); 2358 __m128d B = _mm_setr_pd(4.0, 1.0); 2359 __m128d M = _mm_min_pd(A, B); 2360 assert(M.array[0] == 1.0); 2361 assert(M.array[1] == 1.0); 2362 } 2363 2364 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 2365 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 2366 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe 2367 { 2368 static if (GDC_with_SSE2) 2369 { 2370 return __builtin_ia32_minsd(a, b); 2371 } 2372 else 2373 { 2374 // Generates minsd starting with LDC 1.3 2375 __m128d r = a; 2376 r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2377 return r; 2378 } 2379 } 2380 unittest 2381 { 2382 __m128d A = _mm_setr_pd(1.0, 3.0); 2383 __m128d B = _mm_setr_pd(4.0, 2.0); 2384 __m128d M = _mm_min_sd(A, B); 2385 assert(M.array[0] == 1.0); 2386 assert(M.array[1] == 3.0); 2387 } 2388 2389 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element. 2390 __m128i _mm_move_epi64 (__m128i a) pure @trusted 2391 { 2392 static if (GDC_with_SSE2) 2393 { 2394 // slightly better with GDC -O0 2395 return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 2396 } 2397 else 2398 { 2399 long2 result = [ 0, 0 ]; 2400 long2 la = cast(long2) a; 2401 result.ptr[0] = la.array[0]; 2402 return cast(__m128i)(result); 2403 } 2404 } 2405 unittest 2406 { 2407 long2 A = [13, 47]; 2408 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); 2409 long[2] correct = [13, 0]; 2410 assert(B.array == correct); 2411 } 2412 2413 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 2414 /// the upper element from `a` to the upper element of dst. 2415 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted 2416 { 2417 static if (GDC_with_SSE2) 2418 { 2419 return __builtin_ia32_movsd(a, b); 2420 } 2421 else 2422 { 2423 b.ptr[1] = a.array[1]; 2424 return b; 2425 } 2426 } 2427 unittest 2428 { 2429 double2 A = [13.0, 47.0]; 2430 double2 B = [34.0, 58.0]; 2431 double2 C = _mm_move_sd(A, B); 2432 double[2] correct = [34.0, 47.0]; 2433 assert(C.array == correct); 2434 } 2435 2436 /// Create mask from the most significant bit of each 8-bit element in `v`. 2437 int _mm_movemask_epi8 (__m128i a) pure @trusted 2438 { 2439 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 2440 static if (GDC_with_SSE2) 2441 { 2442 return __builtin_ia32_pmovmskb128(cast(ubyte16)a); 2443 } 2444 else static if (LDC_with_SSE2) 2445 { 2446 return __builtin_ia32_pmovmskb128(cast(byte16)a); 2447 } 2448 else static if (LDC_with_ARM64) 2449 { 2450 // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon 2451 // The other two solutions lead to unfound intrinsics in LLVM and that took a long time. 2452 // SO there might be something a bit faster, but this one is reasonable and branchless. 
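        // How it works: mask_and keeps only the most significant bit of each byte, and the
        // per-lane right shift (by 7, 6, ..., 0) moves lane i's bit to bit position i. The bits
        // are disjoint, so the three vpadd passes behave like a bitwise OR and fold each half
        // into a single byte; the two bytes then form the low and high half of the 16-bit mask.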
2453 byte8 mask_shift; 2454 mask_shift.ptr[0] = 7; 2455 mask_shift.ptr[1] = 6; 2456 mask_shift.ptr[2] = 5; 2457 mask_shift.ptr[3] = 4; 2458 mask_shift.ptr[4] = 3; 2459 mask_shift.ptr[5] = 2; 2460 mask_shift.ptr[6] = 1; 2461 mask_shift.ptr[7] = 0; 2462 byte8 mask_and = byte8(-128); 2463 byte8 lo = vget_low_u8(cast(byte16)a); 2464 byte8 hi = vget_high_u8(cast(byte16)a); 2465 lo = vand_u8(lo, mask_and); 2466 lo = vshr_u8(lo, mask_shift); 2467 hi = vand_u8(hi, mask_and); 2468 hi = vshr_u8(hi, mask_shift); 2469 lo = vpadd_u8(lo,lo); 2470 lo = vpadd_u8(lo,lo); 2471 lo = vpadd_u8(lo,lo); 2472 hi = vpadd_u8(hi,hi); 2473 hi = vpadd_u8(hi,hi); 2474 hi = vpadd_u8(hi,hi); 2475 return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]); 2476 } 2477 else 2478 { 2479 byte16 ai = cast(byte16)a; 2480 int r = 0; 2481 foreach(bit; 0..16) 2482 { 2483 if (ai.array[bit] < 0) r += (1 << bit); 2484 } 2485 return r; 2486 } 2487 } 2488 unittest 2489 { 2490 assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0))); 2491 } 2492 2493 /// Create mask from the most significant bit of each 16-bit element in `v`. #BONUS 2494 int _mm_movemask_epi16 (__m128i a) pure @trusted 2495 { 2496 return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128())); 2497 } 2498 unittest 2499 { 2500 assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8))); 2501 } 2502 2503 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 2504 /// loating-point element in `v`. 2505 int _mm_movemask_pd(__m128d v) pure @safe 2506 { 2507 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 2508 static if (GDC_with_SSE2) 2509 { 2510 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2511 /// packed double-precision (64-bit) floating-point element in `v`. 2512 return __builtin_ia32_movmskpd(v); 2513 } 2514 else static if (LDC_with_SSE2) 2515 { 2516 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2517 /// packed double-precision (64-bit) floating-point element in `v`. 2518 return __builtin_ia32_movmskpd(v); 2519 } 2520 else 2521 { 2522 long2 lv = cast(long2)v; 2523 int r = 0; 2524 if (lv.array[0] < 0) r += 1; 2525 if (lv.array[1] < 0) r += 2; 2526 return r; 2527 } 2528 } 2529 unittest 2530 { 2531 __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0); 2532 assert(_mm_movemask_pd(A) == 2); 2533 } 2534 2535 /// Copy the lower 64-bit integer in `v`. 2536 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe 2537 { 2538 long2 lv = cast(long2)v; 2539 return long1(lv.array[0]); 2540 } 2541 unittest 2542 { 2543 __m128i A = _mm_set_epi64x(-1, -2); 2544 __m64 R = _mm_movepi64_pi64(A); 2545 assert(R.array[0] == -2); 2546 } 2547 2548 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element. 
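// This is the counterpart of `_mm_movepi64_pi64` above, going in the other direction:
// it widens an MMX-style `__m64` into the low lane of an `__m128i` and zeroes the high lane.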
2549 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted 2550 { 2551 long2 r; 2552 r.ptr[0] = a.array[0]; 2553 r.ptr[1] = 0; 2554 return cast(__m128i)r; 2555 } 2556 2557 // Note: generates pmuludq in LDC with -O1 2558 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted 2559 { 2560 __m128i zero = _mm_setzero_si128(); 2561 2562 static if (__VERSION__ >= 2088) 2563 { 2564 // Need LLVM9 to avoid this shufflevector 2565 long2 la, lb; 2566 la.ptr[0] = cast(uint)a.array[0]; 2567 la.ptr[1] = cast(uint)a.array[2]; 2568 lb.ptr[0] = cast(uint)b.array[0]; 2569 lb.ptr[1] = cast(uint)b.array[2]; 2570 } 2571 else 2572 { 2573 long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero); 2574 long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero); 2575 } 2576 2577 version(DigitalMars) 2578 { 2579 // DMD has no long2 mul 2580 // long2 mul not supported before LDC 1.5 2581 la.ptr[0] *= lb.array[0]; 2582 la.ptr[1] *= lb.array[1]; 2583 return cast(__m128i)(la); 2584 } 2585 else 2586 { 2587 static if (__VERSION__ >= 2076) 2588 { 2589 return cast(__m128i)(la * lb); 2590 } 2591 else 2592 { 2593 // long2 mul not supported before LDC 1.5 2594 la.ptr[0] *= lb.array[0]; 2595 la.ptr[1] *= lb.array[1]; 2596 return cast(__m128i)(la); 2597 } 2598 } 2599 } 2600 unittest 2601 { 2602 __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff); 2603 __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff); 2604 __m128i C = _mm_mul_epu32(A, B); 2605 long2 LC = cast(long2)C; 2606 assert(LC.array[0] == 18446744065119617025uL); 2607 assert(LC.array[1] == 12723420444339690338uL); 2608 } 2609 2610 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 2611 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe 2612 { 2613 pragma(inline, true); 2614 return a * b; 2615 } 2616 unittest 2617 { 2618 __m128d a = [-2.0, 1.5]; 2619 a = _mm_mul_pd(a, a); 2620 assert(a.array == [4.0, 2.25]); 2621 } 2622 2623 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 2624 /// element of result, and copy the upper element from `a` to the upper element of result. 2625 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted 2626 { 2627 version(DigitalMars) 2628 { 2629 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 2630 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 2631 asm pure nothrow @nogc @trusted { nop;} 2632 a.array[0] = a.array[0] * b.array[0]; 2633 return a; 2634 } 2635 else static if (GDC_with_SSE2) 2636 { 2637 return __builtin_ia32_mulsd(a, b); 2638 } 2639 else 2640 { 2641 a.ptr[0] *= b.array[0]; 2642 return a; 2643 } 2644 } 2645 unittest 2646 { 2647 __m128d a = [-2.0, 1.5]; 2648 a = _mm_mul_sd(a, a); 2649 assert(a.array == [4.0, 1.5]); 2650 } 2651 2652 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 2653 /// and get an unsigned 64-bit result. 2654 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe 2655 { 2656 return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b))); 2657 } 2658 unittest 2659 { 2660 __m64 A = _mm_set_pi32(42, 0xDEADBEEF); 2661 __m64 B = _mm_set_pi32(42, 0xCAFEBABE); 2662 __m64 C = _mm_mul_su32(A, B); 2663 assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL); 2664 } 2665 2666 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2667 /// high 16 bits of the intermediate integers. 
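// Worked example for one lane, using the same values as the unittest below:
// -16 * 16384 = -262144 = 0xFFFC_0000, and its high 16 bits are 0xFFFC, i.e. -4.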
2668 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted 2669 { 2670 static if (GDC_with_SSE2) 2671 { 2672 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2673 } 2674 else static if (LDC_with_SSE2) 2675 { 2676 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2677 } 2678 else 2679 { 2680 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h 2681 // PERF: it seems the simde solution has one less instruction in ARM64. 2682 // PERF: Catastrophic in ARM32. 2683 short8 sa = cast(short8)a; 2684 short8 sb = cast(short8)b; 2685 short8 r = void; 2686 r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16; 2687 r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16; 2688 r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16; 2689 r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16; 2690 r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16; 2691 r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16; 2692 r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16; 2693 r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16; 2694 return cast(__m128i)r; 2695 } 2696 } 2697 unittest 2698 { 2699 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2700 __m128i B = _mm_set1_epi16(16384); 2701 short8 R = cast(short8)_mm_mulhi_epi16(A, B); 2702 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; 2703 assert(R.array == correct); 2704 } 2705 2706 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2707 /// high 16 bits of the intermediate integers. 2708 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted 2709 { 2710 static if (GDC_with_SSE2) 2711 { 2712 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2713 } 2714 else static if (LDC_with_SSE2) 2715 { 2716 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2717 } 2718 else 2719 { 2720 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h 2721 // it seems the simde solution has one less instruction in ARM64 2722 // PERF: Catastrophic in ARM32. 2723 short8 sa = cast(short8)a; 2724 short8 sb = cast(short8)b; 2725 short8 r = void; 2726 r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 ); 2727 r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 ); 2728 r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 ); 2729 r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 ); 2730 r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 ); 2731 r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 ); 2732 r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 ); 2733 r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 ); 2734 return cast(__m128i)r; 2735 } 2736 } 2737 unittest 2738 { 2739 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2740 __m128i B = _mm_set1_epi16(16384); 2741 short8 R = cast(short8)_mm_mulhi_epu16(A, B); 2742 short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; 2743 assert(R.array == correct); 2744 } 2745 2746 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 2747 /// bits of the intermediate integers. 
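// Worked example, matching the unittest below: 16384 * 16384 = 0x1000_0000, whose low 16 bits
// are 0; and 3 * 16384 = 0xC000, which reinterpreted as a signed short is -16384.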
2748 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe 2749 { 2750 return cast(__m128i)(cast(short8)a * cast(short8)b); 2751 } 2752 unittest 2753 { 2754 __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7); 2755 __m128i B = _mm_set1_epi16(16384); 2756 short8 R = cast(short8)_mm_mullo_epi16(A, B); 2757 short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384]; 2758 assert(R.array == correct); 2759 } 2760 2761 /// Compute the bitwise NOT of 128 bits in `a`. #BONUS 2762 __m128i _mm_not_si128 (__m128i a) pure @safe 2763 { 2764 return ~a; 2765 } 2766 unittest 2767 { 2768 __m128i A = _mm_set1_epi32(-748); 2769 int4 notA = cast(int4) _mm_not_si128(A); 2770 int[4] correct = [747, 747, 747, 747]; 2771 assert(notA.array == correct); 2772 } 2773 2774 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 2775 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe 2776 { 2777 pragma(inline, true); 2778 return cast(__m128d)( cast(__m128i)a | cast(__m128i)b ); 2779 } 2780 2781 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`. 2782 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe 2783 { 2784 pragma(inline, true); 2785 return a | b; 2786 } 2787 2788 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation. 2789 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted 2790 { 2791 static if (GDC_with_SSE2) 2792 { 2793 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2794 } 2795 else static if (LDC_with_SSE2) 2796 { 2797 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2798 } 2799 else static if (LDC_with_ARM64) 2800 { 2801 short4 ra = vqmovn_s32(cast(int4)a); 2802 short4 rb = vqmovn_s32(cast(int4)b); 2803 return cast(__m128i)vcombine_s16(ra, rb); 2804 } 2805 else 2806 { 2807 // PERF: catastrophic on ARM32 2808 short8 r; 2809 r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]); 2810 r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]); 2811 r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]); 2812 r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]); 2813 r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]); 2814 r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]); 2815 r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]); 2816 r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]); 2817 return cast(__m128i)r; 2818 } 2819 } 2820 unittest 2821 { 2822 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 2823 short8 R = cast(short8) _mm_packs_epi32(A, A); 2824 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; 2825 assert(R.array == correct); 2826 } 2827 2828 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation. 
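// Each 16-bit lane is clamped to [-128, 127] before narrowing, so in the unittest below
// 1000 packs to 127 and -1000 packs to -128.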
2829 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted 2830 { 2831 static if (GDC_with_SSE2) 2832 { 2833 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2834 } 2835 else static if (LDC_with_SSE2) 2836 { 2837 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2838 } 2839 else static if (LDC_with_ARM64) 2840 { 2841 // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02 2842 byte8 ra = vqmovn_s16(cast(short8)a); 2843 byte8 rb = vqmovn_s16(cast(short8)b); 2844 return cast(__m128i)vcombine_s8(ra, rb); 2845 } 2846 else 2847 { 2848 // PERF: ARM32 is missing 2849 byte16 r; 2850 short8 sa = cast(short8)a; 2851 short8 sb = cast(short8)b; 2852 foreach(i; 0..8) 2853 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]); 2854 foreach(i; 0..8) 2855 r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]); 2856 return cast(__m128i)r; 2857 } 2858 } 2859 unittest 2860 { 2861 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); 2862 byte16 R = cast(byte16) _mm_packs_epi16(A, A); 2863 byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0, 2864 127, -128, 127, 0, 127, -128, 127, 0]; 2865 assert(R.array == correct); 2866 } 2867 2868 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation. 2869 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2870 { 2871 static if (GDC_with_SSE2) 2872 { 2873 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2874 } 2875 else static if (LDC_with_SSE2) 2876 { 2877 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2878 } 2879 else static if (LDC_with_ARM64) 2880 { 2881 // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02 2882 byte8 ra = vqmovun_s16(cast(short8)a); 2883 byte8 rb = vqmovun_s16(cast(short8)b); 2884 return cast(__m128i)vcombine_s8(ra, rb); 2885 } 2886 else 2887 { 2888 short8 sa = cast(short8)a; 2889 short8 sb = cast(short8)b; 2890 ubyte[16] result = void; 2891 for (int i = 0; i < 8; ++i) 2892 { 2893 short s = sa[i]; 2894 if (s < 0) s = 0; 2895 if (s > 255) s = 255; 2896 result[i] = cast(ubyte)s; 2897 2898 s = sb[i]; 2899 if (s < 0) s = 0; 2900 if (s > 255) s = 255; 2901 result[i+8] = cast(ubyte)s; 2902 } 2903 return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr); 2904 } 2905 } 2906 unittest 2907 { 2908 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 2909 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 2910 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 2911 0, 255, 0, 255, 255, 2, 1, 0]; 2912 foreach(i; 0..16) 2913 assert(AA.array[i] == cast(byte)(correctResult[i])); 2914 } 2915 2916 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 2917 /// and power consumption of spin-wait loops. 2918 void _mm_pause() @trusted 2919 { 2920 version(GNU) 2921 { 2922 static if (GDC_with_SSE2) 2923 { 2924 __builtin_ia32_pause(); 2925 } 2926 else version(X86) 2927 { 2928 asm pure nothrow @nogc @trusted 2929 { 2930 "pause;\n" : : : ; 2931 } 2932 } 2933 else 2934 static assert(false); 2935 } 2936 else static if (LDC_with_SSE2) 2937 { 2938 __builtin_ia32_pause(); 2939 } 2940 else static if (DMD_with_asm) 2941 { 2942 asm nothrow @nogc pure @safe 2943 { 2944 rep; nop; // F3 90 = pause 2945 } 2946 } 2947 else version (LDC) 2948 { 2949 // PERF: Do nothing currently , could be the "yield" intruction on ARM. 
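        // A possible sketch, not used here: with LDC on AArch64 the hint could presumably be
        // emitted through `ldc.llvmasm`, e.g. `__asm("yield", "");`. The conservative choice
        // for now is to keep this branch a no-op.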
2950 } 2951 else 2952 static assert(false); 2953 } 2954 unittest 2955 { 2956 _mm_pause(); 2957 } 2958 2959 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 2960 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 2961 /// low 16 bits of 64-bit elements in result. 2962 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted 2963 { 2964 static if (GDC_with_SSE2) 2965 { 2966 return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b); 2967 } 2968 else static if (LDC_with_SSE2) 2969 { 2970 return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b); 2971 } 2972 else static if (LDC_with_ARM64) 2973 { 2974 ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b)); 2975 2976 // PERF: Looks suboptimal vs addp 2977 ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]); 2978 ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]); 2979 ushort8 r = 0; 2980 r[0] = r0; 2981 r[4] = r4; 2982 return cast(__m128i) r; 2983 } 2984 else 2985 { 2986 // PERF: ARM32 is lacking 2987 byte16 ab = cast(byte16)a; 2988 byte16 bb = cast(byte16)b; 2989 ubyte[16] t; 2990 foreach(i; 0..16) 2991 { 2992 int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]); 2993 if (diff < 0) diff = -diff; 2994 t[i] = cast(ubyte)(diff); 2995 } 2996 int4 r = _mm_setzero_si128(); 2997 r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 2998 r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; 2999 return r; 3000 } 3001 } 3002 unittest 3003 { 3004 __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 3005 __m128i B = _mm_set1_epi8(1); 3006 __m128i R = _mm_sad_epu8(A, B); 3007 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 3008 0, 3009 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 3010 0]; 3011 assert(R.array == correct); 3012 } 3013 3014 /// Set packed 16-bit integers with the supplied values. 3015 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 3016 { 3017 short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7]; 3018 return cast(__m128i) loadUnaligned!(short8)(result.ptr); 3019 } 3020 unittest 3021 { 3022 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 3023 short8 B = cast(short8) A; 3024 foreach(i; 0..8) 3025 assert(B.array[i] == i); 3026 } 3027 3028 /// Set packed 32-bit integers with the supplied values. 3029 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3030 { 3031 pragma(inline, true); 3032 int[4] result = [e0, e1, e2, e3]; 3033 return loadUnaligned!(int4)(result.ptr); 3034 } 3035 unittest 3036 { 3037 __m128i A = _mm_set_epi32(3, 2, 1, 0); 3038 foreach(i; 0..4) 3039 assert(A.array[i] == i); 3040 } 3041 3042 /// Set packed 64-bit integers with the supplied values. 3043 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted 3044 { 3045 pragma(inline, true); 3046 long[2] result = [e0.array[0], e1.array[0]]; 3047 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 3048 } 3049 unittest 3050 { 3051 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); 3052 long2 B = cast(long2) A; 3053 assert(B.array[0] == 5678); 3054 assert(B.array[1] == 1234); 3055 } 3056 3057 /// Set packed 64-bit integers with the supplied values. 
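// Argument order note: `e1` becomes the upper 64-bit lane and `e0` the lower one, so
// `_mm_set_epi64x(1234, 5678)` puts 5678 in lane 0 (see the unittest below);
// `_mm_setr_epi64` is the reversed-order variant.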
3058 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted 3059 { 3060 pragma(inline, true); 3061 long[2] result = [e0, e1]; 3062 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 3063 } 3064 unittest 3065 { 3066 __m128i A = _mm_set_epi64x(1234, 5678); 3067 long2 B = cast(long2) A; 3068 assert(B.array[0] == 5678); 3069 assert(B.array[1] == 1234); 3070 } 3071 3072 /// Set packed 8-bit integers with the supplied values. 3073 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12, 3074 byte e11, byte e10, byte e9, byte e8, 3075 byte e7, byte e6, byte e5, byte e4, 3076 byte e3, byte e2, byte e1, byte e0) pure @trusted 3077 { 3078 byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7, 3079 e8, e9, e10, e11, e12, e13, e14, e15]; 3080 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 3081 } 3082 3083 /// Set packed double-precision (64-bit) floating-point elements with the supplied values. 3084 __m128d _mm_set_pd (double e1, double e0) pure @trusted 3085 { 3086 pragma(inline, true); 3087 double[2] result = [e0, e1]; 3088 return loadUnaligned!(double2)(result.ptr); 3089 } 3090 unittest 3091 { 3092 __m128d A = _mm_set_pd(61.0, 55.0); 3093 double[2] correct = [55.0, 61.0]; 3094 assert(A.array == correct); 3095 } 3096 3097 /// Broadcast double-precision (64-bit) floating-point value `a` to all element. 3098 __m128d _mm_set_pd1 (double a) pure @trusted 3099 { 3100 pragma(inline, true); 3101 double[2] result = [a, a]; 3102 return loadUnaligned!(double2)(result.ptr); 3103 } 3104 unittest 3105 { 3106 __m128d A = _mm_set_pd1(61.0); 3107 double[2] correct = [61.0, 61.0]; 3108 assert(A.array == correct); 3109 } 3110 3111 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 3112 /// and zero the upper element. 3113 __m128d _mm_set_sd (double a) pure @trusted 3114 { 3115 double[2] result = [a, 0]; 3116 return loadUnaligned!(double2)(result.ptr); 3117 } 3118 3119 /// Broadcast 16-bit integer a to all elements of dst. 3120 __m128i _mm_set1_epi16 (short a) pure @trusted 3121 { 3122 version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 3123 { 3124 short8 v = a; 3125 return cast(__m128i) v; 3126 } 3127 else 3128 { 3129 pragma(inline, true); 3130 return cast(__m128i)(short8(a)); 3131 } 3132 } 3133 unittest 3134 { 3135 short8 a = cast(short8) _mm_set1_epi16(31); 3136 for (int i = 0; i < 8; ++i) 3137 assert(a.array[i] == 31); 3138 } 3139 3140 /// Broadcast 32-bit integer `a` to all elements. 3141 __m128i _mm_set1_epi32 (int a) pure @trusted 3142 { 3143 pragma(inline, true); 3144 return cast(__m128i)(int4(a)); 3145 } 3146 unittest 3147 { 3148 int4 a = cast(int4) _mm_set1_epi32(31); 3149 for (int i = 0; i < 4; ++i) 3150 assert(a.array[i] == 31); 3151 } 3152 3153 /// Broadcast 64-bit integer `a` to all elements. 
3154 __m128i _mm_set1_epi64 (__m64 a) pure @safe 3155 { 3156 return _mm_set_epi64(a, a); 3157 } 3158 unittest 3159 { 3160 long b = 0x1DEADCAFE; 3161 __m64 a; 3162 a.ptr[0] = b; 3163 long2 c = cast(long2) _mm_set1_epi64(a); 3164 assert(c.array[0] == b); 3165 assert(c.array[1] == b); 3166 } 3167 3168 /// Broadcast 64-bit integer `a` to all elements 3169 __m128i _mm_set1_epi64x (long a) pure @trusted 3170 { 3171 long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3172 return cast(__m128i)(b); 3173 } 3174 unittest 3175 { 3176 long b = 0x1DEADCAFE; 3177 long2 c = cast(long2) _mm_set1_epi64x(b); 3178 for (int i = 0; i < 2; ++i) 3179 assert(c.array[i] == b); 3180 } 3181 3182 /// Broadcast 8-bit integer `a` to all elements. 3183 __m128i _mm_set1_epi8 (byte a) pure @trusted 3184 { 3185 pragma(inline, true); 3186 byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3187 return cast(__m128i)(b); 3188 } 3189 unittest 3190 { 3191 byte16 b = cast(byte16) _mm_set1_epi8(31); 3192 for (int i = 0; i < 16; ++i) 3193 assert(b.array[i] == 31); 3194 } 3195 3196 alias _mm_set1_pd = _mm_set_pd1; 3197 3198 /// Set packed 16-bit integers with the supplied values in reverse order. 3199 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 3200 short e3, short e2, short e1, short e0) pure @trusted 3201 { 3202 short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0]; 3203 return cast(__m128i)( loadUnaligned!(short8)(result.ptr) ); 3204 } 3205 unittest 3206 { 3207 short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0); 3208 short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0]; 3209 assert(A.array == correct); 3210 } 3211 3212 /// Set packed 32-bit integers with the supplied values in reverse order. 3213 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3214 { 3215 pragma(inline, true); 3216 int[4] result = [e3, e2, e1, e0]; 3217 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 3218 } 3219 unittest 3220 { 3221 int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647); 3222 int[4] correct = [-1, 0, -2147483648, 2147483647]; 3223 assert(A.array == correct); 3224 } 3225 3226 /// Set packed 64-bit integers with the supplied values in reverse order. 3227 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted 3228 { 3229 long[2] result = [e1, e0]; 3230 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 3231 } 3232 unittest 3233 { 3234 long2 A = cast(long2) _mm_setr_epi64(-1, 0); 3235 long[2] correct = [-1, 0]; 3236 assert(A.array == correct); 3237 } 3238 3239 /// Set packed 8-bit integers with the supplied values in reverse order. 3240 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12, 3241 byte e11, byte e10, byte e9, byte e8, 3242 byte e7, byte e6, byte e5, byte e4, 3243 byte e3, byte e2, byte e1, byte e0) pure @trusted 3244 { 3245 byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, 3246 e7, e6, e5, e4, e3, e2, e1, e0]; 3247 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 3248 } 3249 3250 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order. 
3251 __m128d _mm_setr_pd (double e1, double e0) pure @trusted 3252 { 3253 pragma(inline, true); 3254 double2 result; 3255 result.ptr[0] = e1; 3256 result.ptr[1] = e0; 3257 return result; 3258 } 3259 unittest 3260 { 3261 __m128d A = _mm_setr_pd(61.0, 55.0); 3262 double[2] correct = [61.0, 55.0]; 3263 assert(A.array == correct); 3264 } 3265 3266 /// Return vector of type `__m128d` with all elements set to zero. 3267 __m128d _mm_setzero_pd () pure @trusted 3268 { 3269 pragma(inline, true); 3270 // Note: using loadUnaligned has better -O0 codegen compared to .ptr 3271 double[2] result = [0.0, 0.0]; 3272 return loadUnaligned!(double2)(result.ptr); 3273 } 3274 3275 /// Return vector of type `__m128i` with all elements set to zero. 3276 __m128i _mm_setzero_si128() pure @trusted 3277 { 3278 pragma(inline, true); 3279 // Note: using loadUnaligned has better -O0 codegen compared to .ptr 3280 int[4] result = [0, 0, 0, 0]; 3281 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 3282 } 3283 3284 /// Shuffle 32-bit integers in a using the control in `imm8`. 3285 /// See_also: `_MM_SHUFFLE`. 3286 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe 3287 { 3288 static if (GDC_with_SSE2) 3289 { 3290 return __builtin_ia32_pshufd(a, imm8); 3291 } 3292 else 3293 { 3294 return shufflevector!(int4, (imm8 >> 0) & 3, 3295 (imm8 >> 2) & 3, 3296 (imm8 >> 4) & 3, 3297 (imm8 >> 6) & 3)(a, a); 3298 } 3299 } 3300 unittest 3301 { 3302 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 3303 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3304 int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A); 3305 int[4] expectedB = [ 3, 2, 1, 0 ]; 3306 assert(B.array == expectedB); 3307 } 3308 3309 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`. 3310 /// See_also: `_MM_SHUFFLE2`. 3311 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe 3312 { 3313 static if (GDC_with_SSE2) 3314 { 3315 return __builtin_ia32_shufpd(a, b, imm8); 3316 } 3317 else 3318 { 3319 return shufflevector!(double2, 0 + ( imm8 & 1 ), 3320 2 + ( (imm8 >> 1) & 1 ))(a, b); 3321 } 3322 } 3323 unittest 3324 { 3325 __m128d A = _mm_setr_pd(0.5, 2.0); 3326 __m128d B = _mm_setr_pd(4.0, 5.0); 3327 enum int SHUFFLE = _MM_SHUFFLE2(1, 1); 3328 __m128d R = _mm_shuffle_pd!SHUFFLE(A, B); 3329 double[2] correct = [ 2.0, 5.0 ]; 3330 assert(R.array == correct); 3331 } 3332 3333 /// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 3334 /// 64 bits of result, with the low 64 bits being copied from from `a` to result. 3335 /// See also: `_MM_SHUFFLE`. 3336 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe 3337 { 3338 static if (GDC_with_SSE2) 3339 { 3340 return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8); 3341 } 3342 else 3343 { 3344 return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3, 3345 4 + ( (imm8 >> 0) & 3 ), 3346 4 + ( (imm8 >> 2) & 3 ), 3347 4 + ( (imm8 >> 4) & 3 ), 3348 4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a); 3349 } 3350 } 3351 unittest 3352 { 3353 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3354 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3355 short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A); 3356 short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ]; 3357 assert(C.array == expectedC); 3358 } 3359 3360 /// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 3361 /// bits of result, with the high 64 bits being copied from from `a` to result. 
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                    ( (imm8 >> 2) & 3 ),
                                                    ( (imm8 >> 4) & 3 ),
                                                    ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}

/// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            pslld XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << bits;
        return r;
    }
}

/// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllq XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // ARM: good since LDC 1.12 -O2
        // ~but -O0 version is catastrophic
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..2)
            r.array[i] = cast(ulong)(sa.array[i]) << bits;
        return cast(__m128i)r;
    }
}

/// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
        return cast(int4)r;
    }
}


/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
3491 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted 3492 { 3493 static if (GDC_with_SSE2) 3494 { 3495 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); 3496 } 3497 else static if (LDC_with_SSE2) 3498 { 3499 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); 3500 } 3501 else 3502 { 3503 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3504 // D says "It's illegal to shift by the same or more bits 3505 // than the size of the quantity being shifted" 3506 // and it's UB instead. 3507 int4 r = _mm_setzero_si128(); 3508 3509 ubyte count = cast(ubyte) imm8; 3510 if (count > 31) 3511 return r; 3512 3513 foreach(i; 0..4) 3514 r.array[i] = cast(uint)(a.array[i]) << count; 3515 return r; 3516 } 3517 } 3518 unittest 3519 { 3520 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3521 __m128i B = _mm_slli_epi32(A, 1); 3522 __m128i B2 = _mm_slli_epi32(A, 1 + 256); 3523 int[4] expectedB = [ 0, 4, 6, -8]; 3524 assert(B.array == expectedB); 3525 assert(B2.array == expectedB); 3526 3527 __m128i C = _mm_slli_epi32(A, 0); 3528 int[4] expectedC = [ 0, 2, 3, -4]; 3529 assert(C.array == expectedC); 3530 3531 __m128i D = _mm_slli_epi32(A, 65); 3532 int[4] expectedD = [ 0, 0, 0, 0]; 3533 assert(D.array == expectedD); 3534 } 3535 3536 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 3537 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted 3538 { 3539 static if (GDC_with_SSE2) 3540 { 3541 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3542 } 3543 else static if (LDC_with_SSE2) 3544 { 3545 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3546 } 3547 else 3548 { 3549 long2 sa = cast(long2)a; 3550 3551 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3552 // D says "It's illegal to shift by the same or more bits 3553 // than the size of the quantity being shifted" 3554 // and it's UB instead. 3555 long2 r = cast(long2) _mm_setzero_si128(); 3556 ubyte count = cast(ubyte) imm8; 3557 if (count > 63) 3558 return cast(__m128i)r; 3559 3560 r.ptr[0] = cast(ulong)(sa.array[0]) << count; 3561 r.ptr[1] = cast(ulong)(sa.array[1]) << count; 3562 return cast(__m128i)r; 3563 } 3564 } 3565 unittest 3566 { 3567 __m128i A = _mm_setr_epi64(8, -4); 3568 long2 B = cast(long2) _mm_slli_epi64(A, 1); 3569 long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024); 3570 long[2] expectedB = [ 16, -8]; 3571 assert(B.array == expectedB); 3572 assert(B2.array == expectedB); 3573 3574 long2 C = cast(long2) _mm_slli_epi64(A, 0); 3575 long[2] expectedC = [ 8, -4]; 3576 assert(C.array == expectedC); 3577 3578 long2 D = cast(long2) _mm_slli_epi64(A, 64); 3579 long[2] expectedD = [ 0, -0]; 3580 assert(D.array == expectedD); 3581 } 3582 3583 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 
3584 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted 3585 { 3586 static if (GDC_with_SSE2) 3587 { 3588 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3589 } 3590 else static if (LDC_with_SSE2) 3591 { 3592 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3593 } 3594 else static if (LDC_with_ARM64) 3595 { 3596 short8 sa = cast(short8)a; 3597 short8 r = cast(short8)_mm_setzero_si128(); 3598 ubyte count = cast(ubyte) imm8; 3599 if (count > 15) 3600 return cast(__m128i)r; 3601 r = sa << short8(count); 3602 return cast(__m128i)r; 3603 } 3604 else 3605 { 3606 short8 sa = cast(short8)a; 3607 short8 r = cast(short8)_mm_setzero_si128(); 3608 ubyte count = cast(ubyte) imm8; 3609 if (count > 15) 3610 return cast(__m128i)r; 3611 foreach(i; 0..8) 3612 r.ptr[i] = cast(short)(sa.array[i] << count); 3613 return cast(__m128i)r; 3614 } 3615 } 3616 unittest 3617 { 3618 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3619 short8 B = cast(short8)( _mm_slli_epi16(A, 1) ); 3620 short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) ); 3621 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 3622 assert(B.array == expectedB); 3623 assert(B2.array == expectedB); 3624 3625 short8 C = cast(short8)( _mm_slli_epi16(A, 16) ); 3626 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 3627 assert(C.array == expectedC); 3628 } 3629 3630 3631 /// Shift `a` left by `bytes` bytes while shifting in zeros. 3632 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted 3633 { 3634 static if (bytes & 0xF0) 3635 { 3636 return _mm_setzero_si128(); 3637 } 3638 else 3639 { 3640 static if (GDC_with_SSE2) 3641 { 3642 return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 3643 } 3644 else version(DigitalMars) 3645 { 3646 version(D_InlineAsm_X86) 3647 { 3648 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64 3649 { 3650 movdqu XMM0, op; 3651 pslldq XMM0, bytes; 3652 movdqu op, XMM0; 3653 } 3654 return op; 3655 } 3656 else 3657 { 3658 byte16 A = cast(byte16)op; 3659 byte16 R; 3660 for (int n = 15; n >= bytes; --n) 3661 R.ptr[n] = A.array[n-bytes]; 3662 for (int n = bytes-1; n >= 0; --n) 3663 R.ptr[n] = 0; 3664 return cast(__m128i)R; 3665 } 3666 } 3667 else 3668 { 3669 return cast(__m128i) shufflevector!(byte16, 3670 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes, 3671 22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes, 3672 28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes) 3673 (cast(byte16)_mm_setzero_si128(), cast(byte16)op); 3674 } 3675 } 3676 } 3677 unittest 3678 { 3679 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3680 short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left 3681 short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ]; 3682 assert(R.array == correct); 3683 3684 __m128i B = _mm_srli_si128!16(_mm_set1_epi32(-1)); 3685 int[4] expectedB = [0, 0, 0, 0]; 3686 assert(B.array == expectedB); 3687 } 3688 3689 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`. 
3690 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted 3691 { 3692 version(LDC) 3693 { 3694 // Disappeared with LDC 1.11 3695 static if (__VERSION__ < 2081) 3696 return __builtin_ia32_sqrtpd(vec); 3697 else 3698 { 3699 vec.array[0] = llvm_sqrt(vec.array[0]); 3700 vec.array[1] = llvm_sqrt(vec.array[1]); 3701 return vec; 3702 } 3703 } 3704 else static if (GDC_with_SSE2) 3705 { 3706 return __builtin_ia32_sqrtpd(vec); 3707 } 3708 else 3709 { 3710 vec.ptr[0] = sqrt(vec.array[0]); 3711 vec.ptr[1] = sqrt(vec.array[1]); 3712 return vec; 3713 } 3714 } 3715 3716 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 3717 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 3718 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted 3719 { 3720 // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only. 3721 // "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 3722 // The quadword at bits 127:64 of the destination operand remains unchanged." 3723 version(LDC) 3724 { 3725 // Disappeared with LDC 1.11 3726 static if (__VERSION__ < 2081) 3727 { 3728 __m128d c = __builtin_ia32_sqrtsd(b); 3729 a[0] = c[0]; 3730 return a; 3731 } 3732 else 3733 { 3734 a.array[0] = llvm_sqrt(b.array[0]); 3735 return a; 3736 } 3737 } 3738 else static if (GDC_with_SSE2) 3739 { 3740 __m128d c = __builtin_ia32_sqrtsd(b); 3741 a.ptr[0] = c.array[0]; 3742 return a; 3743 } 3744 else 3745 { 3746 a.ptr[0] = sqrt(b.array[0]); 3747 return a; 3748 } 3749 } 3750 unittest 3751 { 3752 __m128d A = _mm_setr_pd(1.0, 3.0); 3753 __m128d B = _mm_setr_pd(4.0, 5.0); 3754 __m128d R = _mm_sqrt_sd(A, B); 3755 double[2] correct = [2.0, 3.0 ]; 3756 assert(R.array == correct); 3757 } 3758 3759 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits. 3760 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted 3761 { 3762 static if (GDC_with_SSE2) 3763 { 3764 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3765 } 3766 else static if (LDC_with_SSE2) 3767 { 3768 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3769 } 3770 else 3771 { 3772 short8 sa = cast(short8)a; 3773 long2 lc = cast(long2)count; 3774 int bits = cast(int)(lc.array[0]); 3775 short8 r = void; 3776 foreach(i; 0..8) 3777 r.ptr[i] = cast(short)(sa.array[i] >> bits); 3778 return cast(int4)r; 3779 } 3780 } 3781 3782 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits. 3783 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted 3784 { 3785 static if (LDC_with_SSE2) 3786 { 3787 return __builtin_ia32_psrad128(a, count); 3788 } 3789 else static if (GDC_with_SSE2) 3790 { 3791 return __builtin_ia32_psrad128(a, count); 3792 } 3793 else 3794 { 3795 int4 r = void; 3796 long2 lc = cast(long2)count; 3797 int bits = cast(int)(lc.array[0]); 3798 r.ptr[0] = (a.array[0] >> bits); 3799 r.ptr[1] = (a.array[1] >> bits); 3800 r.ptr[2] = (a.array[2] >> bits); 3801 r.ptr[3] = (a.array[3] >> bits); 3802 return r; 3803 } 3804 } 3805 3806 3807 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 
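// Unlike the logical shifts above, a count of 16 or more does not zero the lanes here:
// the count is clamped to 15 so that every bit ends up a copy of the sign bit (0 or -1),
// matching what `psraw` does with large shift counts.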
3808 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 3809 { 3810 static if (GDC_with_SSE2) 3811 { 3812 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3813 } 3814 else static if (LDC_with_SSE2) 3815 { 3816 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3817 } 3818 else static if (LDC_with_ARM64) 3819 { 3820 short8 sa = cast(short8)a; 3821 ubyte count = cast(ubyte)imm8; 3822 if (count > 15) 3823 count = 15; 3824 short8 r = sa >> short8(count); 3825 return cast(__m128i)r; 3826 } 3827 else 3828 { 3829 short8 sa = cast(short8)a; 3830 short8 r = void; 3831 3832 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3833 // D says "It's illegal to shift by the same or more bits 3834 // than the size of the quantity being shifted" 3835 // and it's UB instead. 3836 ubyte count = cast(ubyte)imm8; 3837 if (count > 15) 3838 count = 15; 3839 foreach(i; 0..8) 3840 r.ptr[i] = cast(short)(sa.array[i] >> count); 3841 return cast(int4)r; 3842 } 3843 } 3844 unittest 3845 { 3846 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3847 short8 B = cast(short8)( _mm_srai_epi16(A, 1) ); 3848 short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) ); 3849 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 3850 assert(B.array == expectedB); 3851 assert(B2.array == expectedB); 3852 3853 short8 C = cast(short8)( _mm_srai_epi16(A, 18) ); 3854 short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ]; 3855 assert(C.array == expectedC); 3856 } 3857 3858 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 3859 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted 3860 { 3861 static if (LDC_with_SSE2) 3862 { 3863 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 3864 } 3865 else static if (GDC_with_SSE2) 3866 { 3867 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 3868 } 3869 else 3870 { 3871 int4 r = void; 3872 3873 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3874 // D says "It's illegal to shift by the same or more bits 3875 // than the size of the quantity being shifted" 3876 // and it's UB instead. 
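            // (Clamping to 31 below is safe: an arithmetic shift by 31 already yields
            // only 0 or -1 per lane, which is exactly what PSRAD produces for any
            // shift count of 32 or more.)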
3877 ubyte count = cast(ubyte) imm8; 3878 if (count > 31) 3879 count = 31; 3880 3881 r.ptr[0] = (a.array[0] >> count); 3882 r.ptr[1] = (a.array[1] >> count); 3883 r.ptr[2] = (a.array[2] >> count); 3884 r.ptr[3] = (a.array[3] >> count); 3885 return r; 3886 } 3887 } 3888 unittest 3889 { 3890 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3891 __m128i B = _mm_srai_epi32(A, 1); 3892 __m128i B2 = _mm_srai_epi32(A, 1 + 256); 3893 int[4] expectedB = [ 0, 1, 1, -2]; 3894 assert(B.array == expectedB); 3895 assert(B2.array == expectedB); 3896 3897 __m128i C = _mm_srai_epi32(A, 32); 3898 int[4] expectedC = [ 0, 0, 0, -1]; 3899 assert(C.array == expectedC); 3900 3901 __m128i D = _mm_srai_epi32(A, 0); 3902 int[4] expectedD = [ 0, 2, 3, -4]; 3903 assert(D.array == expectedD); 3904 } 3905 3906 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted 3907 { 3908 static if (LDC_with_SSE2) 3909 { 3910 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 3911 } 3912 else static if (GDC_with_SSE2) 3913 { 3914 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 3915 } 3916 else 3917 { 3918 short8 sa = cast(short8)a; 3919 long2 lc = cast(long2)count; 3920 int bits = cast(int)(lc.array[0]); 3921 short8 r = void; 3922 foreach(i; 0..8) 3923 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits); 3924 return cast(int4)r; 3925 } 3926 } 3927 3928 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted 3929 { 3930 static if (LDC_with_SSE2) 3931 { 3932 return __builtin_ia32_psrld128(a, count); 3933 } 3934 else static if (GDC_with_SSE2) 3935 { 3936 return __builtin_ia32_psrld128(a, count); 3937 } 3938 else 3939 { 3940 int4 r = void; 3941 long2 lc = cast(long2)count; 3942 int bits = cast(int)(lc.array[0]); 3943 r.ptr[0] = cast(uint)(a.array[0]) >> bits; 3944 r.ptr[1] = cast(uint)(a.array[1]) >> bits; 3945 r.ptr[2] = cast(uint)(a.array[2]) >> bits; 3946 r.ptr[3] = cast(uint)(a.array[3]) >> bits; 3947 return r; 3948 } 3949 } 3950 3951 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted 3952 { 3953 static if (LDC_with_SSE2) 3954 { 3955 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 3956 } 3957 else static if (GDC_with_SSE2) 3958 { 3959 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 3960 } 3961 else 3962 { 3963 // Workaround for https://issues.dlang.org/show_bug.cgi?id=23047 3964 // => avoid void initialization. 3965 long2 r; 3966 long2 sa = cast(long2)a; 3967 long2 lc = cast(long2)count; 3968 int bits = cast(int)(lc.array[0]); 3969 r.ptr[0] = cast(ulong)(sa.array[0]) >> bits; 3970 r.ptr[1] = cast(ulong)(sa.array[1]) >> bits; 3971 return cast(__m128i)r; 3972 } 3973 } 3974 3975 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 
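///
/// Example (a hedged sketch): zeros are shifted in, in contrast to `_mm_srai_epi16`.
/// ---
/// short8 r = cast(short8) _mm_srli_epi16(_mm_set1_epi16(-1), 8);
/// assert(r.array[0] == 0x00FF);
/// ---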
3976 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted 3977 { 3978 static if (GDC_with_SSE2) 3979 { 3980 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 3981 } 3982 else static if (LDC_with_SSE2) 3983 { 3984 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 3985 } 3986 else static if (LDC_with_ARM64) 3987 { 3988 short8 sa = cast(short8)a; 3989 short8 r = cast(short8) _mm_setzero_si128(); 3990 3991 ubyte count = cast(ubyte)imm8; 3992 if (count >= 16) 3993 return cast(__m128i)r; 3994 3995 r = sa >>> short8(count); // This facility offered with LDC, but not DMD. 3996 return cast(__m128i)r; 3997 } 3998 else 3999 { 4000 short8 sa = cast(short8)a; 4001 ubyte count = cast(ubyte)imm8; 4002 4003 short8 r = cast(short8) _mm_setzero_si128(); 4004 if (count >= 16) 4005 return cast(__m128i)r; 4006 4007 foreach(i; 0..8) 4008 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count); 4009 return cast(__m128i)r; 4010 } 4011 } 4012 unittest 4013 { 4014 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 4015 short8 B = cast(short8)( _mm_srli_epi16(A, 1) ); 4016 short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) ); 4017 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 4018 assert(B.array == expectedB); 4019 assert(B2.array == expectedB); 4020 4021 short8 C = cast(short8)( _mm_srli_epi16(A, 16) ); 4022 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0]; 4023 assert(C.array == expectedC); 4024 4025 short8 D = cast(short8)( _mm_srli_epi16(A, 0) ); 4026 short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ]; 4027 assert(D.array == expectedD); 4028 } 4029 4030 4031 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 4032 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted 4033 { 4034 static if (GDC_with_SSE2) 4035 { 4036 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4037 } 4038 else static if (LDC_with_SSE2) 4039 { 4040 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4041 } 4042 else 4043 { 4044 ubyte count = cast(ubyte) imm8; 4045 4046 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4047 // D says "It's illegal to shift by the same or more bits 4048 // than the size of the quantity being shifted" 4049 // and it's UB instead. 4050 int4 r = _mm_setzero_si128(); 4051 if (count >= 32) 4052 return r; 4053 r.ptr[0] = a.array[0] >>> count; 4054 r.ptr[1] = a.array[1] >>> count; 4055 r.ptr[2] = a.array[2] >>> count; 4056 r.ptr[3] = a.array[3] >>> count; 4057 return r; 4058 } 4059 } 4060 unittest 4061 { 4062 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 4063 __m128i B = _mm_srli_epi32(A, 1); 4064 __m128i B2 = _mm_srli_epi32(A, 1 + 256); 4065 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; 4066 assert(B.array == expectedB); 4067 assert(B2.array == expectedB); 4068 4069 __m128i C = _mm_srli_epi32(A, 255); 4070 int[4] expectedC = [ 0, 0, 0, 0 ]; 4071 assert(C.array == expectedC); 4072 } 4073 4074 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 
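///
/// Example (a hedged sketch): the shift is a logical one on each 64-bit lane.
/// ---
/// long2 r = cast(long2) _mm_srli_epi64(_mm_setr_epi64(-1, 16), 4);
/// assert(r.array == [0x0FFF_FFFF_FFFF_FFFF, 1]);
/// ---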
4075 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted 4076 { 4077 static if (GDC_with_SSE2) 4078 { 4079 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4080 } 4081 else static if (LDC_with_SSE2) 4082 { 4083 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4084 } 4085 else 4086 { 4087 long2 r = cast(long2) _mm_setzero_si128(); 4088 long2 sa = cast(long2)a; 4089 4090 ubyte count = cast(ubyte) imm8; 4091 if (count >= 64) 4092 return cast(__m128i)r; 4093 4094 r.ptr[0] = sa.array[0] >>> count; 4095 r.ptr[1] = sa.array[1] >>> count; 4096 return cast(__m128i)r; 4097 } 4098 } 4099 unittest 4100 { 4101 __m128i A = _mm_setr_epi64(8, -4); 4102 long2 B = cast(long2) _mm_srli_epi64(A, 1); 4103 long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512); 4104 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; 4105 assert(B.array == expectedB); 4106 assert(B2.array == expectedB); 4107 4108 long2 C = cast(long2) _mm_srli_epi64(A, 64); 4109 long[2] expectedC = [ 0, 0 ]; 4110 assert(C.array == expectedC); 4111 } 4112 4113 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4114 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe 4115 { 4116 static if (bytes & 0xF0) 4117 { 4118 return _mm_setzero_si128(); 4119 } 4120 else static if (GDC_with_SSE2) 4121 { 4122 return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8)); 4123 } 4124 else static if (DMD_with_32bit_asm) 4125 { 4126 asm pure nothrow @nogc @trusted 4127 { 4128 movdqu XMM0, v; 4129 psrldq XMM0, bytes; 4130 movdqu v, XMM0; 4131 } 4132 return v; 4133 } 4134 else 4135 { 4136 return cast(__m128i) shufflevector!(byte16, 4137 bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7, 4138 bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15) 4139 (cast(byte16) v, cast(byte16)_mm_setzero_si128()); 4140 } 4141 } 4142 unittest 4143 { 4144 __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1)); 4145 int[4] correct = [2, 3, 4, 0]; 4146 assert(R.array == correct); 4147 4148 __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1)); 4149 int[4] expectedA = [0, 0, 0, 0]; 4150 assert(A.array == expectedA); 4151 } 4152 4153 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4154 /// #BONUS 4155 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe 4156 { 4157 return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v); 4158 } 4159 unittest 4160 { 4161 __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)); 4162 float[4] correct = [3.0f, 4.0f, 0, 0]; 4163 assert(R.array == correct); 4164 } 4165 4166 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4167 /// #BONUS 4168 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe 4169 { 4170 return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v); 4171 } 4172 4173 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 4174 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 4175 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted 4176 { 4177 pragma(inline, true); 4178 __m128d* aligned = cast(__m128d*)mem_addr; 4179 *aligned = a; 4180 } 4181 4182 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 4183 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
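///
/// Example (a hedged sketch; `align(16)` is assumed to give the destination the required 16-byte alignment):
/// ---
/// align(16) double[2] buf;
/// _mm_store_pd1(buf.ptr, _mm_setr_pd(2.5, -1.0));
/// assert(buf == [2.5, 2.5]);
/// ---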
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    __m128d r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[0];
    *aligned = r;
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to
/// be aligned on any particular boundary.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[0];
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a
/// general-protection exception may be generated.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1; ///

/// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[1];
}

// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
/// Store 64-bit integer from the first element of `a` into memory.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    long* dest = cast(long*)mem_addr;
    long2 la = cast(long2)a;
    *dest = la.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[0];
}

/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be
/// aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    storeUnaligned!double2(a, mem_addr);
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular
/// boundary.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}

/// Store 32-bit integer from the first element of `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
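///
/// Example (a hedged sketch; assumes a little-endian target, as on x86):
/// ---
/// ubyte[8] buf;
/// _mm_storeu_si32(buf.ptr + 3, _mm_set1_epi32(0x04030201)); // a misaligned destination is fine
/// assert(buf[3] == 0x01); // low byte is written first
/// ---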
4268 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted 4269 { 4270 pragma(inline, true); 4271 int* dest = cast(int*)mem_addr; 4272 *dest = a.array[0]; 4273 } 4274 unittest 4275 { 4276 int[2] arr = [-24, 12]; 4277 _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7)); 4278 assert(arr == [-24, -1]); 4279 } 4280 4281 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) 4282 /// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte 4283 /// boundary or a general-protection exception may be generated. 4284 void _mm_stream_pd (double* mem_addr, __m128d a) 4285 { 4286 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4287 __m128d* dest = cast(__m128d*)mem_addr; 4288 *dest = a; 4289 } 4290 4291 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint. 4292 /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception 4293 /// may be generated. 4294 void _mm_stream_si128 (__m128i* mem_addr, __m128i a) 4295 { 4296 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4297 __m128i* dest = cast(__m128i*)mem_addr; 4298 *dest = a; 4299 } 4300 4301 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache 4302 /// pollution. If the cache line containing address mem_addr is already in the cache, 4303 /// the cache will be updated. 4304 void _mm_stream_si32 (int* mem_addr, int a) 4305 { 4306 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4307 *mem_addr = a; 4308 } 4309 4310 /// Store 64-bit integer a into memory using a non-temporal hint to minimize 4311 /// cache pollution. If the cache line containing address mem_addr is already 4312 /// in the cache, the cache will be updated. 4313 void _mm_stream_si64 (long* mem_addr, long a) 4314 { 4315 // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4316 *mem_addr = a; 4317 } 4318 4319 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. 4320 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe 4321 { 4322 pragma(inline, true); 4323 return cast(__m128i)(cast(short8)a - cast(short8)b); 4324 } 4325 4326 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. 4327 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe 4328 { 4329 pragma(inline, true); 4330 return cast(__m128i)(cast(int4)a - cast(int4)b); 4331 } 4332 4333 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. 4334 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe 4335 { 4336 pragma(inline, true); 4337 return cast(__m128i)(cast(long2)a - cast(long2)b); 4338 } 4339 4340 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. 4341 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe 4342 { 4343 pragma(inline, true); 4344 return cast(__m128i)(cast(byte16)a - cast(byte16)b); 4345 } 4346 4347 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 4348 /// floating-point elements in `a`. 
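///
/// Example (a hedged sketch):
/// ---
/// __m128d r = _mm_sub_pd(_mm_setr_pd(3.0, 5.0), _mm_setr_pd(1.0, 8.0));
/// assert(r.array == [2.0, -3.0]);
/// ---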
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a - b;
}

/// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit)
/// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
/// upper element of result.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_subsd(a, b);
    }
    else
    {
        a.ptr[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}

/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    pragma(inline, true);
    return a - b;
}

/// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed integers in `a` using signed saturation.
__m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PSUBSW since LDC 1.15 -O0
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(  -10,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed integers in `a` using signed saturation.
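///
/// Example (a hedged sketch): results saturate to the `byte` range instead of wrapping.
/// ---
/// byte16 r = cast(byte16) _mm_subs_epi8(_mm_set1_epi8(-100), _mm_set1_epi8(100));
/// assert(r.array[0] == byte.min); // -200 saturates to -128
/// ---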
__m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBSB since LDC 1.15 -O0
            // ARM: Generates sqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned integers in `a` using unsigned saturation.
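///
/// Example (a hedged sketch): unsigned saturation clamps at zero.
/// ---
/// short8 r = cast(short8) _mm_subs_epu16(_mm_set1_epi16(5), _mm_set1_epi16(10));
/// assert(r.array[0] == 0);
/// ---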
__m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSW since LDC 1.15 -O0
            // ARM: Generates uqsub.8h since LDC 1.21 -O0
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(sum);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubusw128(a, b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            res[i] = saturateSignedIntToUnsignedShort(sum);
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct = [0, 0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}

/// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned integers in `a` using unsigned saturation.
__m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSB since LDC 1.15 -O0
            // ARM: Generates uqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
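            // Scalar fallback: widen each lane, subtract, then clamp to [0, 255].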
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return __builtin_ia32_psubusb128(a, b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Note: the only difference between the `ucomi` and `comi` comparisons is their
// signalling behaviour on quiet NaNs. Aliasing them is therefore slightly incorrect,
// but a case where you would want to distinguish qNaN from sNaN and treat them
// differently on purpose seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd; ///
alias _mm_ucomige_sd = _mm_comige_sd; ///
alias _mm_ucomigt_sd = _mm_comigt_sd; ///
alias _mm_ucomile_sd = _mm_comile_sd; ///
alias _mm_ucomilt_sd = _mm_comilt_sd; ///
alias _mm_ucomineq_sd = _mm_comineq_sd; ///

/// Return vector of type `__m128d` with undefined elements.
__m128d _mm_undefined_pd() pure @safe
{
    pragma(inline, true);
    __m128d result = void;
    return result;
}

/// Return vector of type `__m128i` with undefined elements.
__m128i _mm_undefined_si128() pure @safe
{
    pragma(inline, true);
    __m128i result = void;
    return result;
}

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
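/// In element order from the least-significant lane, the result is `(a2, b2, a3, b3)`.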
4677 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted 4678 { 4679 static if (GDC_with_SSE2) 4680 { 4681 return __builtin_ia32_punpckhdq128(a, b); 4682 } 4683 else version(DigitalMars) 4684 { 4685 __m128i r; 4686 r.ptr[0] = a.array[2]; 4687 r.ptr[1] = b.array[2]; 4688 r.ptr[2] = a.array[3]; 4689 r.ptr[3] = b.array[3]; 4690 return r; 4691 } 4692 else 4693 { 4694 return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b); 4695 } 4696 } 4697 unittest 4698 { 4699 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4700 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 4701 __m128i C = _mm_unpackhi_epi32(A, B); 4702 int[4] correct = [3, 7, 4, 8]; 4703 assert(C.array == correct); 4704 } 4705 4706 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. 4707 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted 4708 { 4709 static if (GDC_with_SSE2) 4710 { 4711 return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b); 4712 } 4713 else 4714 { 4715 __m128i r = cast(__m128i)b; 4716 r[0] = a[2]; 4717 r[1] = a[3]; 4718 return r; 4719 } 4720 } 4721 unittest // Issue #36 4722 { 4723 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4724 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4725 long2 C = cast(long2)(_mm_unpackhi_epi64(A, B)); 4726 long[2] correct = [0x33333333_33333333, 0x55555555_55555555]; 4727 assert(C.array == correct); 4728 } 4729 4730 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. 4731 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe 4732 { 4733 static if (GDC_with_SSE2) 4734 { 4735 return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b); 4736 } 4737 else static if (DMD_with_32bit_asm) 4738 { 4739 asm pure nothrow @nogc @trusted 4740 { 4741 movdqu XMM0, a; 4742 movdqu XMM1, b; 4743 punpckhbw XMM0, XMM1; 4744 movdqu a, XMM0; 4745 } 4746 return a; 4747 } 4748 else 4749 { 4750 return cast(__m128i)shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27, 4751 12, 28, 13, 29, 14, 30, 15, 31) 4752 (cast(byte16)a, cast(byte16)b); 4753 } 4754 } 4755 unittest 4756 { 4757 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 4758 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 4759 byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B); 4760 byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]; 4761 assert(C.array == correct); 4762 } 4763 4764 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`. 4765 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe 4766 { 4767 static if (GDC_with_SSE2) 4768 { 4769 return __builtin_ia32_unpckhpd(a, b); 4770 } 4771 else 4772 { 4773 return shufflevector!(__m128d, 1, 3)(a, b); 4774 } 4775 } 4776 unittest 4777 { 4778 __m128d A = _mm_setr_pd(4.0, 6.0); 4779 __m128d B = _mm_setr_pd(7.0, 9.0); 4780 __m128d C = _mm_unpackhi_pd(A, B); 4781 double[2] correct = [6.0, 9.0]; 4782 assert(C.array == correct); 4783 } 4784 4785 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. 
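/// In element order from the least-significant lane, the result is `(a0, b0, a1, b1, a2, b2, a3, b3)`.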
4786 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe 4787 { 4788 static if (GDC_with_SSE2) 4789 { 4790 return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b); 4791 } 4792 else static if (DMD_with_32bit_asm) 4793 { 4794 asm pure nothrow @nogc @trusted 4795 { 4796 movdqu XMM0, a; 4797 movdqu XMM1, b; 4798 punpcklwd XMM0, XMM1; 4799 movdqu a, XMM0; 4800 } 4801 return a; 4802 } 4803 else 4804 { 4805 return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11) 4806 (cast(short8)a, cast(short8)b); 4807 } 4808 } 4809 unittest 4810 { 4811 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 4812 __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); 4813 short8 C = cast(short8) _mm_unpacklo_epi16(A, B); 4814 short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11]; 4815 assert(C.array == correct); 4816 } 4817 4818 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. 4819 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted 4820 { 4821 static if (GDC_with_SSE2) 4822 { 4823 return __builtin_ia32_punpckldq128(a, b); 4824 } 4825 else version(DigitalMars) 4826 { 4827 __m128i r; 4828 r.ptr[0] = a.array[0]; 4829 r.ptr[1] = b.array[0]; 4830 r.ptr[2] = a.array[1]; 4831 r.ptr[3] = b.array[1]; 4832 return r; 4833 } 4834 else 4835 { 4836 return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b); 4837 } 4838 } 4839 unittest 4840 { 4841 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4842 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 4843 __m128i C = _mm_unpacklo_epi32(A, B); 4844 int[4] correct = [1, 5, 2, 6]; 4845 assert(C.array == correct); 4846 } 4847 4848 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. 4849 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted 4850 { 4851 static if (GDC_with_SSE2) 4852 { 4853 return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b); 4854 } 4855 else 4856 { 4857 long2 lA = cast(long2)a; 4858 long2 lB = cast(long2)b; 4859 long2 R; 4860 R.ptr[0] = lA.array[0]; 4861 R.ptr[1] = lB.array[0]; 4862 return cast(__m128i)R; 4863 } 4864 } 4865 unittest // Issue #36 4866 { 4867 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4868 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4869 long2 C = cast(long2)(_mm_unpacklo_epi64(A, B)); 4870 long[2] correct = [0x22222222_22222222, 0x44444444_44444444]; 4871 assert(C.array == correct); 4872 } 4873 4874 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. 
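/// In element order from the least-significant lane, the result is `(a0, b0, a1, b1, ..., a7, b7)`.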
4875 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe 4876 { 4877 static if (GDC_with_SSE2) 4878 { 4879 return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b); 4880 } 4881 else static if (DMD_with_32bit_asm) 4882 { 4883 asm pure nothrow @nogc @trusted 4884 { 4885 movdqu XMM0, a; 4886 movdqu XMM1, b; 4887 punpcklbw XMM0, XMM1; 4888 movdqu a, XMM0; 4889 } 4890 return a; 4891 } 4892 else 4893 { 4894 return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19, 4895 4, 20, 5, 21, 6, 22, 7, 23) 4896 (cast(byte16)a, cast(byte16)b); 4897 } 4898 } 4899 unittest 4900 { 4901 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 4902 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 4903 byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B); 4904 byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]; 4905 assert(C.array == correct); 4906 } 4907 4908 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`. 4909 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe 4910 { 4911 static if (GDC_with_SSE2) 4912 { 4913 return __builtin_ia32_unpcklpd(a, b); 4914 } 4915 else 4916 { 4917 return shufflevector!(__m128d, 0, 2)(a, b); 4918 } 4919 } 4920 unittest 4921 { 4922 __m128d A = _mm_setr_pd(4.0, 6.0); 4923 __m128d B = _mm_setr_pd(7.0, 9.0); 4924 __m128d C = _mm_unpacklo_pd(A, B); 4925 double[2] correct = [4.0, 7.0]; 4926 assert(C.array == correct); 4927 } 4928 4929 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 4930 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe 4931 { 4932 return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b); 4933 } 4934 // TODO unittest and thus force inline 4935 4936 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`. 4937 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe 4938 { 4939 return a ^ b; 4940 } 4941 // TODO unittest and thus force inline 4942 4943 unittest 4944 { 4945 float distance(float[4] a, float[4] b) nothrow @nogc 4946 { 4947 __m128 va = _mm_loadu_ps(a.ptr); 4948 __m128 vb = _mm_loadu_ps(b.ptr); 4949 __m128 diffSquared = _mm_sub_ps(va, vb); 4950 diffSquared = _mm_mul_ps(diffSquared, diffSquared); 4951 __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared)); 4952 sum = _mm_add_ps(sum, _mm_srli_ps!4(sum)); 4953 return _mm_cvtss_f32(_mm_sqrt_ss(sum)); 4954 } 4955 assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2); 4956 }
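
// Hedged unittest sketches for `_mm_xor_si128` and `_mm_xor_pd` above (both are
// marked "TODO unittest"); they rely only on intrinsics already defined in this module.
unittest
{
    __m128i A = _mm_setr_epi32(0x0F0F0F0F, 0x00000000, -1, 0x55555555);
    __m128i B = _mm_setr_epi32(0x33333333, 0x00000000, -1, 0x55555555);
    __m128i R = _mm_xor_si128(A, B);
    int[4] correct = [0x3C3C3C3C, 0, 0, 0];
    assert(R.array == correct);
}
unittest
{
    // XOR-ing a double vector with itself clears every bit, giving +0.0 lanes.
    __m128d A = _mm_setr_pd(4.0, -2.0);
    __m128d R = _mm_xor_pd(A, A);
    assert(R.array == [0.0, 0.0]);
}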