/**
* SSE2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1
import inteli.mmx;
import inteli.internals;

nothrow @nogc:


// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(short8)a + cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 R = cast(short8) _mm_add_epi16(A, A);
    short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(int4)a + cast(int4)b);
}
unittest
{
    __m128i A = _mm_setr_epi32(-7, -1, 0, 9);
    int4 R = _mm_add_epi32(A, A);
    int[4] correct = [-14, -2, 0, 18];
    assert(R.array == correct);
}

/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(long2)a + cast(long2)b);
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 R = cast(long2) _mm_add_epi64(A, A);
    long[2] correct = [-2, 0];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(byte16)a + cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 R = cast(byte16) _mm_add_epi8(A, A);
    byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(R.array == correct);
}

/// Add the lower double-precision (64-bit) floating-point element
/// in `a` and `b`, store the result in the lower element of result,
/// and copy the upper element from `a` to the upper element of result.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note: this seems unneeded since DMD >= 2.094.0 at least, but hasn't been re-investigated.
        asm pure nothrow @nogc @trusted { nop; }
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}

/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    pragma(inline, true);
    return a + b;
}
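// Illustrative check, not from the original source: 64-bit addition wraps around on
// overflow, like the packed variants. Uses `to_m64` and `_mm_set_epi64x` as elsewhere
// in this module.
unittest
{
    __m64 A = to_m64(_mm_set_epi64x(0, -1));
    __m64 B = to_m64(_mm_set_epi64x(0, 3));
    assert(_mm_add_si64(A, B).array[0] == 2);
}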
/// Add packed 16-bit integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSW since LDC 1.15 -O0
            // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
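// Extra check, not from the original tests: verify the sum actually saturates at the
// bounds of the signed 16-bit range instead of wrapping.
unittest
{
    short8 R = cast(short8) _mm_adds_epi16(_mm_set1_epi16(32767), _mm_set1_epi16(1));
    assert(R.array[0] == 32767);
    R = cast(short8) _mm_adds_epi16(_mm_set1_epi16(-32768), _mm_set1_epi16(-1));
    assert(R.array[0] == -32768);
}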
/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSB since LDC 1.15 -O0
            // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}
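// Extra check, not from the original tests: saturation at the signed 8-bit bounds.
unittest
{
    byte16 R = cast(byte16) _mm_adds_epi8(_mm_set1_epi8(127), _mm_set1_epi8(1));
    assert(R.array[0] == 127);
    R = cast(byte16) _mm_adds_epi8(_mm_set1_epi8(-128), _mm_set1_epi8(-1));
    assert(R.array[0] == -128);
}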
/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSB since LDC 1.15 -O0
            // ARM: Generates uqadd.16b since LDC 1.21 -O1
            enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusb128(a, b);
    }
    else
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16)
        _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                      _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14,
                                               0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSW since LDC 1.15 -O0
            // ARM: Generates uqadd.8h since LDC 1.21 -O1
            enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ushort[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusw128(a, b);
    }
    else
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}
/// Compute the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return cast(__m128d)( cast(long2)a & cast(long2)b );
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m128d A = _mm_set_pd(a, b);
    __m128d B = _mm_set_pd(b, a);
    long2 R = cast(long2)( _mm_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return a & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_and_si128(A, B);
    int[4] correct = [6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of packed double-precision (64-bit)
/// floating-point elements in `a` and then AND with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( ~(cast(long2)a) & cast(long2)b );
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
    long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
    __m128d A = _mm_setr_pd(a, b);
    __m128d B = _mm_setr_pd(b, a);
    long2 R = cast(long2)( _mm_andnot_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct2);
}

/// Compute the bitwise NOT of 128 bits (representing integer data)
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_andnot_si128(A, B);
    int[4] correct = [8, 8, 8, 8];
    assert(R.array == correct);
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48);
}
/// Average packed unsigned 8-bit integers in `a` and `b`.
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
    }
    else version(LDC)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48);
}

/// Shift `a` left by `bytes` bytes while shifting in zeros.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact );
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact );
}

/// Cast vector of type `__m128d` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Cast vector of type `__m128d` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128i` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128i` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}

/// Invalidate and flush the cache line that contains `p`
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else
    {
        // Do nothing. Invalidating the cache line does
        // not affect correctness.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0];
    short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4 A = [-3, -2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, -1];
    int4 R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0,-1];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}
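// Extra check, not from the original tests: equal lanes yield an all-ones mask,
// and NaN never compares equal under this ordered predicate.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, double.nan);
    long2 R = cast(long2) _mm_cmpeq_pd(A, B);
    assert(R.array[0] == -1);
    assert(R.array[1] == 0);
}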
/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1];
    short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b) );
    }
}
unittest
{
    int4 A = [-3, 2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, 0];
    int4 R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else
    {
        return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct = [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}
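// Extra check, not from the original tests: ordered greater-than yields an all-ones
// mask, and an unordered (NaN) comparison yields zero.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(0.0, 1.0);
    long2 R = cast(long2) _mm_cmpgt_pd(A, B);
    assert(R.array[0] == -1);
    assert(R.array[1] == 0);
}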
/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed 32-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}
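// Extra check, not from the original tests: not-equal is an unordered predicate,
// so NaN operands compare as "not-equal"; the upper lane is copied from `a`.
unittest
{
    __m128d A = _mm_setr_pd(double.nan, 8.0);
    __m128d B = _mm_setr_pd(double.nan, 9.0);
    __m128d R = _mm_cmpneq_sd(A, B);
    long2 l = cast(long2) R;
    assert(l.array[0] == -1);  // unordered => true
    assert(R.array[1] == 8.0); // upper element comes from `a`
}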
/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal, store the result in
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}
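// Extra check, not from the original tests: the "not-" predicates are unordered,
// so they return true whenever one operand is NaN.
unittest
{
    __m128d A = _mm_setr_pd(double.nan, 2.0);
    __m128d B = _mm_setr_pd(1.0, 1.0);
    long2 R = cast(long2) _mm_cmpnlt_sd(A, B);
    assert(R.array[0] == -1); // unordered => not-less-than is true
}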
/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN, store the result in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN, store the result in the lower
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}
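// Extra check, not from the original tests: a NaN in the lower lane makes the
// comparison unordered (all-ones mask), and the upper lane is copied from `a`.
unittest
{
    __m128d A = _mm_setr_pd(double.nan, 4.0);
    __m128d B = _mm_setr_pd(1.0, 1.0);
    __m128d R = _mm_cmpunord_sd(A, B);
    long2 l = cast(long2) R;
    assert(l.array[0] == -1);  // NaN => unordered
    assert(R.array[1] == 4.0); // upper element comes from `a`
}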
/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    // Note: For some of the _mm_comixx_sx intrinsics, the NaN semantics of the intrinsic
    // are not the same as the comisd instruction: they return false when unordered instead.
    //
    // C++ compilers actually disagree over the meaning of that instruction.
    // GCC handles NaNs like the comisd instruction (returns true if unordered),
    // but ICC, clang and MSVC deal with NaN the way the Intel Intrinsics Guide says.
    // We choose to follow the majority; GCC seems to be buggy with NaNs.
    return a.array[0] == b.array[0];
}
unittest
{
    assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than-or-equal, and return the boolean
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] >= b.array[0];
}
unittest
{
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] > b.array[0];
}
unittest
{
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] <= b.array[0];
}
unittest
{
    assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] < b.array[0];
}
unittest
{
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] != b.array[0];
}
unittest
{
    assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else version(LDC)
    {
        // See #86 for why we had to resort to LLVM IR.
        // Plain code below was leading to catastrophic behaviour.
        // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x float>
            ret <4 x float> %r`;
        return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
    }
    else
    {
        __m128 res;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    // PERF ARM32
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        long2 i;
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
            case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
            case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
        }
        int4 zero = 0;
        return cast(__m128i) shufflevector!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
    }
    else
    {
        // PERF ARM32
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    {
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_cvtps2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    // GDC bug #98607
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide an optimization barrier for the rounding mode.
    // Worked around with different literals; this bug will likely only manifest in unittests.
    // The GCC people provided no actual fix and instead claim other compilers are buggy... when they aren't.

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}
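// Extra check, not from the original tests: the conversion rounds according to MXCSR;
// with round-to-nearest, ties go to the even integer.
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-4 == _mm_cvtsd_si32(_mm_set1_pd(-4.5))); // tie rounds to even
    _MM_SET_ROUNDING_MODE(savedRounding);
}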
/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtsd_si64 (__m128d a) @trusted
{
    version (LDC)
    {
        version (X86_64)
        {
            return __builtin_ia32_cvtsd2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
    else
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///

/// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit)
/// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b);
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a.ptr[0] = b.array[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}

/// Get the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la.array[0];
}
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
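// Extra check, not from the original tests: lower-lane extraction for both widths.
unittest
{
    __m128i A = _mm_setr_epi32(-1, 2, 3, 4);
    assert(_mm_cvtsi128_si32(A) == -1);
    assert(_mm_cvtsi128_si64(_mm_set_epi64x(123, -42)) == -42);
}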
/// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 r = [0, 0, 0, 0];
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}

/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store that in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}

deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///

/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit)
/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the
/// upper element of result.
double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
/// Put zeroes in the upper elements of result.
__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: generates fcvtze since LDC 1.8 -O2
    __m128i r;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0,
    // but on 32-bit it's a long sequence that resorts to the FPU.
    return cast(long)a.array[0];
}

deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///
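// Extra check, not from the original tests: truncation always rounds toward zero,
// regardless of the current MXCSR rounding mode.
unittest
{
    assert(-4 == _mm_cvttsd_si32(_mm_set1_pd(-4.9)));
    assert(56468486186 == _mm_cvttsd_si64(_mm_set1_pd(56468486186.9)));
}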
/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a / b;
}

/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower
/// double-precision (64-bit) floating-point element in `b`, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_divsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note: this seems unneeded since DMD >= 2.094.0 at least, but hasn't been re-investigated.
        asm pure nothrow @nogc @trusted { nop; }
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        a.ptr[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
    assert(_mm_extract_epi16(A, 5 + 8) == 5);
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

/// Perform a serializing operation on all load-from-memory instructions that were issued
/// prior to this instruction.
void _mm_lfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_lfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
    else version(LDC)
    {
        llvm_memory_fence(); // PERF actually generates mfence
    }
    else
        static assert(false);
}
unittest
{
    _mm_lfence();
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double)* mem_addr) pure
{
    pragma(inline, true);
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}
unittest
{
    align(16) double[2] S = [-5.0, 7.0];
    __m128d R = _mm_load_pd(S.ptr);
    assert(R.array == S);
}

/// Load a double-precision (64-bit) floating-point element from memory into both elements of result.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double m = *mem_addr;
    __m128d r;
    r.ptr[0] = m;
    r.ptr[1] = m;
    return r;
}
unittest
{
    double what = 4;
    __m128d R = _mm_load_pd1(&what);
    double[2] correct = [4.0, 4];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper
/// element. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Load 128-bits of integer data from memory into result.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be @trusted because of alignment, Issue #62
{
    pragma(inline, true);
    return *mem_addr;
}
unittest
{
    align(16) int[4] correct = [-1, 2, 3, 4];
    int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
    assert(A.array == correct);
}

alias _mm_load1_pd = _mm_load_pd1; ///

/// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the
/// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    pragma(inline, true);
    a.ptr[1] = *mem_addr;
    return a;
}
unittest
{
    double A = 7.0;
    __m128d B = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadh_pd(B, &A);
    double[2] correct = [ 4.0, 7.0 ];
    assert(R.array == correct);
}

/// Load 64-bit integer from memory into the first element of result. Zero out the other.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60)
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
{
    pragma(inline, true);
    auto pLong = cast(const(long)*)mem_addr;
    long2 r = [0, 0];
    r.ptr[0] = *pLong;
    return cast(__m128i)(r);
}
unittest
{
    long A = 0x7878787870707070;
    long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
    long[2] correct = [0x7878787870707070, 0];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the
/// upper element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    a.ptr[0] = *mem_addr;
    return a;
}
unittest
{
    double A = 7.0;
    __m128d B = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadl_pd(B, &A);
    double[2] correct = [ 7.0, -5.0 ];
    assert(R.array == correct);
}

/// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    __m128d a = *cast(__m128d*)(mem_addr);
    __m128d r;
    r.ptr[0] = a.array[1];
    r.ptr[1] = a.array[0];
    return r;
}
unittest
{
    align(16) double[2] A = [56.0, -74.0];
    __m128d R = _mm_loadr_pd(A.ptr);
    double[2] correct = [-74.0, 56.0];
    assert(R.array == correct);
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_loadupd(mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(double2)(mem_addr);
    }
    else version(DigitalMars)
    {
        static if (DMD_with_DSIMD)
        {
            return cast(__m128d)__simd(XMM.LODUPD, *mem_addr);
        }
        else static if (SSESizedVectorsAreEmulated)
        {
            // Since this vector is emulated, it doesn't have alignment constraints
            // and as such we can just cast it.
            return *cast(__m128d*)(mem_addr);
        }
        else
        {
            __m128d result;
            result.ptr[0] = mem_addr[0];
            result.ptr[1] = mem_addr[1];
            return result;
        }
    }
    else
    {
        __m128d result;
        result.ptr[0] = mem_addr[0];
        result.ptr[1] = mem_addr[1];
        return result;
    }
}
unittest
{
    double[2] A = [56.0, -75.0];
    __m128d R = _mm_loadu_pd(A.ptr);
    double[2] correct = [56.0, -75.0];
    assert(R.array == correct);
}

/// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    pragma(inline, true);
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr);
    }
    else
    {
        return loadUnaligned!(__m128i)(cast(int*)mem_addr);
    }
}
unittest
{
    align(16) int[4] correct = [-1, 2, -3, 4];
    int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr);
    assert(A.array == correct);
}

/// Load unaligned 32-bit integer from memory into the first element of result.
__m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted
{
    pragma(inline, true);
    int r = *cast(int*)(mem_addr);
    int4 result = [0, 0, 0, 0];
    result.ptr[0] = r;
    return result;
}
unittest
{
    int r = 42;
    __m128i A = _mm_loadu_si32(&r);
    int[4] correct = [42, 0, 0, 0];
    assert(A.array == correct);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate
/// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers,
/// and pack the results in destination.
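/// This is the classic building block for 16-bit dot products. A minimal
/// sketch (`coeffs` and `samples` are hypothetical `__m128i` values):
/// ---
/// // lane i of acc = coeffs[2i]*samples[2i] + coeffs[2i+1]*samples[2i+1]
/// __m128i acc = _mm_madd_epi16(coeffs, samples);
/// ---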
__m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b));
        int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b));
        int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
        int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
        return vcombine_s32(rl, rh);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        int4 r;
        foreach(i; 0..4)
        {
            r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1];
        }
        return r;
    }
}
unittest
{
    short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767];
    int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B);
    int[4] correct = [1, 13, -2147483648, 2*32767*32767];
    assert(R.array == correct);
}

/// Conditionally store 8-bit integer elements from `a` into memory using `mask`
/// (elements are not stored when the highest bit is not set in the corresponding element)
/// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular
/// boundary.
void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr);
    }
    else static if (LDC_with_ARM64)
    {
        // PERF: catastrophic on ARM32
        byte16 bmask = cast(byte16)mask;
        byte16 shift = 7;
        bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask
        mask = cast(__m128i) bmask;
        __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr);
        dest = (a & mask) | (dest & ~mask);
        storeUnaligned!__m128i(dest, cast(int*)mem_addr);
    }
    else
    {
        byte16 b = cast(byte16)a;
        byte16 m = cast(byte16)mask;
        byte* dest = cast(byte*)(mem_addr);
        foreach(j; 0..16)
        {
            if (m.array[j] & 128)
            {
                dest[j] = b.array[j];
            }
        }
    }
}
unittest
{
    ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42];
    __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0);
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
    _mm_maskmoveu_si128(A, mask, dest.ptr);
    ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42];
    assert(dest == correct);
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values.
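/// A common use is clamping: for instance, `_mm_max_epi16(v, _mm_setzero_si128())`
/// forces every lane of a hypothetical `v` to be at least zero.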
__m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pmaxsw since LDC 1.0 -O1
        // ARM: smax.8h since LDC 1.5 -O1
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 greater = greaterMask!short8(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, else b
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lowerShorts);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, -57),
                                          _mm_setr_epi16(-4, -8, 9, 7, 0, -32768, 0, 0));
    short[8] correct = [32767, 1, 9, 7, 9, 7, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed maximum values.
__m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe
{
    version(LDC)
    {
        // x86: pmaxub since LDC 1.0.0 -O1
        // ARM64: umax.16b since LDC 1.5.0 -O1
        // PERF: catastrophic on ARM32
        ubyte16 sa = cast(ubyte16)a;
        ubyte16 sb = cast(ubyte16)b;
        ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i value128 = _mm_set1_epi8(-128);
        __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
                                         _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
    byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57];
    assert(R.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values.
__m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxpd(a, b);
    }
    else
    {
        // x86: Generates maxpd starting with LDC 1.9 -O2
        a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, 8.0);
    __m128d M = _mm_max_pd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 8.0);
}

/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_maxsd(a, b);
    }
    else
    {
        __m128d r = a;
        // Generates maxsd starting with LDC 1.3
        r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 1.0);
}

/// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to
/// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction
/// is globally visible before any memory instruction which follows the fence in program order.
void _mm_mfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_mfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
    else version(LDC)
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_mfence();
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 greater = greaterMask!short8(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, else b
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lowerShorts);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0, -32768),
                                          _mm_setr_epi16(-4, -8, 9, 7, 0, -57, 0, 0));
    short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -32768];
    assert(R.array == correct);
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
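/// Lanes are compared as unsigned, so a byte of -1 is treated as 255; for example
/// `_mm_min_epu8(_mm_set1_epi8(-1), _mm_set1_epi8(1))` yields 1 in every lane.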
__m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe
{
    version(LDC)
    {
        // x86: pminub since LDC 1.0.0 -O1
        // ARM: umin.16b since LDC 1.5.0 -O1
        // PERF: catastrophic on ARM32
        ubyte16 sa = cast(ubyte16)a;
        ubyte16 sb = cast(ubyte16)b;
        ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i value128 = _mm_set1_epi8(-128);
        __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0),
                                         _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57));
    byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0];
    assert(R.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values.
__m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minpd(a, b);
    }
    else
    {
        // Generates minpd starting with LDC 1.9
        a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1];
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(4.0, 1.0);
    __m128d M = _mm_min_pd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 1.0);
}

/// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_min_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_minsd(a, b);
    }
    else
    {
        // Generates minsd starting with LDC 1.3
        __m128d r = a;
        r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_min_sd(A, B);
    assert(M.array[0] == 1.0);
    assert(M.array[1] == 3.0);
}

/// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element.
__m128i _mm_move_epi64 (__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        // slightly better with GDC -O0
        return cast(__m128i) __builtin_ia32_movq128(cast(long2)a);
    }
    else
    {
        long2 result = [ 0, 0 ];
        long2 la = cast(long2) a;
        result.ptr[0] = la.array[0];
        return cast(__m128i)(result);
    }
}
unittest
{
    long2 A = [13, 47];
    long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A );
    long[2] correct = [13, 0];
    assert(B.array == correct);
}

/// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy
/// the upper element from `a` to the upper element of dst.
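/// For example, combining [1.0, 2.0] and [3.0, 4.0] gives [3.0, 2.0]:
/// ---
/// __m128d r = _mm_move_sd(_mm_setr_pd(1.0, 2.0), _mm_setr_pd(3.0, 4.0));
/// ---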
__m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movsd(a, b);
    }
    else
    {
        b.ptr[1] = a.array[1];
        return b;
    }
}
unittest
{
    double2 A = [13.0, 47.0];
    double2 B = [34.0, 58.0];
    double2 C = _mm_move_sd(A, B);
    double[2] correct = [34.0, 47.0];
    assert(C.array == correct);
}

/// Create mask from the most significant bit of each 8-bit element in `a`.
int _mm_movemask_epi8 (__m128i a) pure @trusted
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pmovmskb128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pmovmskb128(cast(byte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
        // The other two solutions led to intrinsics LLVM couldn't find, and diagnosing that took a long time.
        // So there might be something a bit faster, but this one is reasonable and branchless.
        byte8 mask_shift;
        mask_shift.ptr[0] = 7;
        mask_shift.ptr[1] = 6;
        mask_shift.ptr[2] = 5;
        mask_shift.ptr[3] = 4;
        mask_shift.ptr[4] = 3;
        mask_shift.ptr[5] = 2;
        mask_shift.ptr[6] = 1;
        mask_shift.ptr[7] = 0;
        byte8 mask_and = byte8(-128);
        byte8 lo = vget_low_u8(cast(byte16)a);
        byte8 hi = vget_high_u8(cast(byte16)a);
        lo = vand_u8(lo, mask_and);
        lo = vshr_u8(lo, mask_shift);
        hi = vand_u8(hi, mask_and);
        hi = vshr_u8(hi, mask_shift);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        lo = vpadd_u8(lo,lo);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        hi = vpadd_u8(hi,hi);
        return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]);
    }
    else
    {
        byte16 ai = cast(byte16)a;
        int r = 0;
        foreach(bit; 0..16)
        {
            if (ai.array[bit] < 0) r += (1 << bit);
        }
        return r;
    }
}
unittest
{
    assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0)));
}

/// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit)
/// floating-point element in `v`.
int _mm_movemask_pd(__m128d v) pure @safe
{
    // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movmskpd(v);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_movmskpd(v);
    }
    else
    {
        long2 lv = cast(long2)v;
        int r = 0;
        if (lv.array[0] < 0) r += 1;
        if (lv.array[1] < 0) r += 2;
        return r;
    }
}
unittest
{
    __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0);
    assert(_mm_movemask_pd(A) == 2);
}

/// Copy the lower 64-bit integer in `v`.
__m64 _mm_movepi64_pi64 (__m128i v) pure @safe
{
    long2 lv = cast(long2)v;
    return long1(lv.array[0]);
}
unittest
{
    __m128i A = _mm_set_epi64x(-1, -2);
    __m64 R = _mm_movepi64_pi64(A);
    assert(R.array[0] == -2);
}

/// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element.
__m128i _mm_movpi64_epi64 (__m64 a) pure @trusted
{
    long2 r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = 0;
    return cast(__m128i)r;
}

/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`,
/// and return the unsigned 64-bit results.
__m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted
{
    // Note: generates pmuludq in LDC with -O1
    __m128i zero = _mm_setzero_si128();

    static if (__VERSION__ >= 2088)
    {
        // Need LLVM9 to avoid this shufflevector
        long2 la, lb;
        la.ptr[0] = cast(uint)a.array[0];
        la.ptr[1] = cast(uint)a.array[2];
        lb.ptr[0] = cast(uint)b.array[0];
        lb.ptr[1] = cast(uint)b.array[2];
    }
    else
    {
        long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero);
        long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero);
    }

    version(DigitalMars)
    {
        // DMD has no long2 multiplication
        la.ptr[0] *= lb.array[0];
        la.ptr[1] *= lb.array[1];
        return cast(__m128i)(la);
    }
    else
    {
        static if (__VERSION__ >= 2076)
        {
            return cast(__m128i)(la * lb);
        }
        else
        {
            // long2 mul not supported before LDC 1.5
            la.ptr[0] *= lb.array[0];
            la.ptr[1] *= lb.array[1];
            return cast(__m128i)(la);
        }
    }
}
unittest
{
    __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff);
    __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff);
    __m128i C = _mm_mul_epu32(A, B);
    long2 LC = cast(long2)C;
    assert(LC.array[0] == 18446744065119617025uL);
    assert(LC.array[1] == 12723420444339690338uL);
}

/// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a * b;
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_pd(a, a);
    assert(a.array == [4.0, 2.25]);
}

/// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower
/// element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] * b.array[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_mulsd(a, b);
    }
    else
    {
        a.ptr[0] *= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [-2.0, 1.5];
    a = _mm_mul_sd(a, a);
    assert(a.array == [4.0, 1.5]);
}

/// Multiply the low unsigned 32-bit integers from `a` and `b`,
/// and get an unsigned 64-bit result.
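/// This widening multiply cannot overflow. A minimal sketch, assuming
/// `_mm_cvtsi32_m64` from the MMX intrinsics is available:
/// ---
/// __m64 p = _mm_mul_su32(_mm_cvtsi32_m64(0x10000), _mm_cvtsi32_m64(0x10000));
/// // p.array[0] == 0x1_0000_0000, which needs the full 64 bits
/// ---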
__m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe
{
    return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b)));
}
unittest
{
    __m64 A = _mm_set_pi32(42, 0xDEADBEEF);
    __m64 B = _mm_set_pi32(42, 0xCAFEBABE);
    __m64 C = _mm_mul_su32(A, B);
    assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL);
}

/// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the
/// high 16 bits of the intermediate integers.
__m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h
        // PERF: it seems the simde solution has one less instruction in ARM64.
        // PERF: Catastrophic in ARM32.
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16;
        r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16;
        r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16;
        r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16;
        r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16;
        r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16;
        r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16;
        r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epi16(A, B);
    short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}

/// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the
/// high 16 bits of the intermediate integers.
__m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h
        // it seems the simde solution has one less instruction in ARM64
        // PERF: Catastrophic in ARM32.
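        // Fallback sketch below: each product is formed on zero-extended 16-bit
        // operands. The intermediate int promotion may wrap, but the low 16 bits
        // left after the >> 16 are still the correct unsigned high half.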
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r = void;
        r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 );
        r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 );
        r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 );
        r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 );
        r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 );
        r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 );
        r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 );
        r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 );
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mulhi_epu16(A, B);
    short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1];
    assert(R.array == correct);
}

/// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16
/// bits of the intermediate integers.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a * cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7);
    __m128i B = _mm_set1_epi16(16384);
    short8 R = cast(short8)_mm_mullo_epi16(A, B);
    short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384];
    assert(R.array == correct);
}

/// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return cast(__m128d)( cast(__m128i)a | cast(__m128i)b );
}

/// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return a | b;
}

/// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation.
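/// A minimal sketch of the saturation behaviour:
/// ---
/// __m128i wide = _mm_set1_epi32(100000);                    // does not fit in short
/// short8 packed = cast(short8) _mm_packs_epi32(wide, wide); // every lane is 32767
/// ---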
__m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_packssdw128(a, b);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_packssdw128(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        short4 ra = vqmovn_s32(cast(int4)a);
        short4 rb = vqmovn_s32(cast(int4)b);
        return cast(__m128i)vcombine_s16(ra, rb);
    }
    else
    {
        // PERF: catastrophic on ARM32
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]);
        r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]);
        r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]);
        r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]);
        r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]);
        r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]);
        r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]);
        r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packs_epi32(A, A);
    short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0];
    assert(R.array == correct);
}

/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation.
__m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Generates a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -O2
        byte8 ra = vqmovn_s16(cast(short8)a);
        byte8 rb = vqmovn_s16(cast(short8)b);
        return cast(__m128i)vcombine_s8(ra, rb);
    }
    else
    {
        // PERF: an optimized ARM32 path is missing
        byte16 r;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]);
        foreach(i; 0..8)
            r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0);
    byte16 R = cast(byte16) _mm_packs_epi16(A, A);
    byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0,
                        127, -128, 127, 0, 127, -128, 127, 0];
    assert(R.array == correct);
}

/// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation.
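/// A typical use is converting 16-bit intermediates back to 8-bit pixels, with
/// negative values clipped to 0 and values above 255 clipped to 255. A sketch
/// (`rowA`/`rowB` are hypothetical vectors of 16-bit samples):
/// ---
/// __m128i pixels = _mm_packus_epi16(rowA, rowB);
/// ---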
__m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Generates a nice pair of sqxtun + sqxtun2 since LDC 1.5 -O2
        byte8 ra = vqmovun_s16(cast(short8)a);
        byte8 rb = vqmovun_s16(cast(short8)b);
        return cast(__m128i)vcombine_s8(ra, rb);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        ubyte[16] result = void;
        for (int i = 0; i < 8; ++i)
        {
            short s = sa[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i] = cast(ubyte)s;

            s = sb[i];
            if (s < 0) s = 0;
            if (s > 255) s = 255;
            result[i+8] = cast(ubyte)s;
        }
        return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0);
    byte16 AA = cast(byte16) _mm_packus_epi16(A, A);
    static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0,
                                                0, 255, 0, 255, 255, 2, 1, 0];
    foreach(i; 0..16)
        assert(AA.array[i] == cast(byte)(correctResult[i]));
}

/// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance
/// and power consumption of spin-wait loops.
void _mm_pause() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_pause();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "pause;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_pause();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            rep; nop; // F3 90 = pause
        }
    }
    else version (LDC)
    {
        // PERF: Does nothing currently; could be the "yield" instruction on ARM.
    }
    else
        static assert(false);
}
unittest
{
    _mm_pause();
}

/// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each
/// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the
/// low 16 bits of 64-bit elements in result.
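/// Adding the two 16-bit partial sums gives the full 16-byte SAD, a common step
/// in motion estimation. A sketch (`blockA`/`blockB` are hypothetical inputs):
/// ---
/// __m128i sad = _mm_sad_epu8(blockA, blockB);
/// int total = _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128!8(sad));
/// ---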
__m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b));

        // PERF: Looks suboptimal vs addp
        ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]);
        ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]);
        ushort8 r = 0;
        r[0] = r0;
        r[4] = r4;
        return cast(__m128i) r;
    }
    else
    {
        // PERF: ARM32 is lacking
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        ubyte[16] t;
        foreach(i; 0..16)
        {
            int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]);
            if (diff < 0) diff = -diff;
            t[i] = cast(ubyte)(diff);
        }
        int4 r = _mm_setzero_si128();
        r.ptr[0] = t[0] + t[1] + t[2]  + t[3]  + t[4]  + t[5]  + t[6]  + t[7];
        r.ptr[2] = t[8] + t[9] + t[10] + t[11] + t[12] + t[13] + t[14] + t[15];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1
    __m128i B = _mm_set1_epi8(1);
    __m128i R = _mm_sad_epu8(A, B);
    int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19,
                      0,
                      23 + 29 + 31 + 37 + 41 + 43 + 47 + 53,
                      0];
    assert(R.array == correct);
}

/// Set packed 16-bit integers with the supplied values.
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7];
    return cast(__m128i) loadUnaligned!(short8)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 B = cast(short8) A;
    foreach(i; 0..8)
        assert(B.array[i] == i);
}

/// Set packed 32-bit integers with the supplied values.
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    pragma(inline, true);
    int[4] result = [e0, e1, e2, e3];
    return loadUnaligned!(int4)(result.ptr);
}
unittest
{
    __m128i A = _mm_set_epi32(3, 2, 1, 0);
    foreach(i; 0..4)
        assert(A.array[i] == i);
}

/// Set packed 64-bit integers with the supplied values.
__m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted
{
    pragma(inline, true);
    long[2] result = [e0.array[0], e1.array[0]];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678));
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

/// Set packed 64-bit integers with the supplied values.
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    pragma(inline, true);
    long[2] result = [e0, e1];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    __m128i A = _mm_set_epi64x(1234, 5678);
    long2 B = cast(long2) A;
    assert(B.array[0] == 5678);
    assert(B.array[1] == 1234);
}

/// Set packed 8-bit integers with the supplied values.
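/// Note the argument order: `e15` is the most significant byte and `e0` the least,
/// so `e0` lands in the lowest lane. Use `_mm_setr_epi8` for memory order instead.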
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e0, e1, e2,  e3,  e4,  e5,  e6,  e7,
                       e8, e9, e10, e11, e12, e13, e14, e15];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied values.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    pragma(inline, true);
    double[2] result = [e0, e1];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_pd(61.0, 55.0);
    double[2] correct = [55.0, 61.0];
    assert(A.array == correct);
}

/// Broadcast double-precision (64-bit) floating-point value `a` to all elements.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    pragma(inline, true);
    double[2] result = [a, a];
    return loadUnaligned!(double2)(result.ptr);
}
unittest
{
    __m128d A = _mm_set_pd1(61.0);
    double[2] correct = [61.0, 61.0];
    assert(A.array == correct);
}

/// Copy double-precision (64-bit) floating-point element `a` to the lower element of result,
/// and zero the upper element.
__m128d _mm_set_sd (double a) pure @trusted
{
    double[2] result = [a, 0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Broadcast 16-bit integer `a` to all elements.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        short8 v = a;
        return cast(__m128i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m128i)(short8(a));
    }
}
unittest
{
    short8 a = cast(short8) _mm_set1_epi16(31);
    for (int i = 0; i < 8; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 32-bit integer `a` to all elements.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    pragma(inline, true);
    return cast(__m128i)(int4(a));
}
unittest
{
    int4 a = cast(int4) _mm_set1_epi32(31);
    for (int i = 0; i < 4; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 64-bit integer `a` to all elements.
__m128i _mm_set1_epi64 (__m64 a) pure @safe
{
    return _mm_set_epi64(a, a);
}
unittest
{
    long b = 0x1DEADCAFE;
    __m64 a;
    a.ptr[0] = b;
    long2 c = cast(long2) _mm_set1_epi64(a);
    assert(c.array[0] == b);
    assert(c.array[1] == b);
}

/// Broadcast 64-bit integer `a` to all elements.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
    return cast(__m128i)(b);
}
unittest
{
    long b = 0x1DEADCAFE;
    long2 c = cast(long2) _mm_set1_epi64x(b);
    for (int i = 0; i < 2; ++i)
        assert(c.array[i] == b);
}

/// Broadcast 8-bit integer `a` to all elements.
__m128i _mm_set1_epi8 (byte a) pure @trusted
{
    pragma(inline, true);
    byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470
    return cast(__m128i)(b);
}
unittest
{
    byte16 b = cast(byte16) _mm_set1_epi8(31);
    for (int i = 0; i < 16; ++i)
        assert(b.array[i] == 31);
}

alias _mm_set1_pd = _mm_set_pd1; ///

/// Set packed 16-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4,
                        short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(short8)(result.ptr) );
}
unittest
{
    short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0);
    short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0];
    assert(A.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    pragma(inline, true);
    int[4] result = [e3, e2, e1, e0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}
unittest
{
    int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647);
    int[4] correct = [-1, 0, -2147483648, 2147483647];
    assert(A.array == correct);
}

/// Set packed 64-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] result = [e1, e0];
    return cast(__m128i)( loadUnaligned!(long2)(result.ptr) );
}
unittest
{
    long2 A = cast(long2) _mm_setr_epi64(-1, 0);
    long[2] correct = [-1, 0];
    assert(A.array == correct);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
__m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12,
                       byte e11, byte e10, byte e9, byte e8,
                       byte e7, byte e6, byte e5, byte e4,
                       byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8,
                       e7,  e6,  e5,  e4,  e3,  e2,  e1, e0];
    return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) );
}

/// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    pragma(inline, true);
    double2 result;
    result.ptr[0] = e1;
    result.ptr[1] = e0;
    return result;
}
unittest
{
    __m128d A = _mm_setr_pd(61.0, 55.0);
    double[2] correct = [61.0, 55.0];
    assert(A.array == correct);
}

/// Return vector of type `__m128d` with all elements set to zero.
__m128d _mm_setzero_pd () pure @trusted
{
    pragma(inline, true);
    // Note: using loadUnaligned has better -O0 codegen compared to .ptr
    double[2] result = [0.0, 0.0];
    return loadUnaligned!(double2)(result.ptr);
}

/// Return vector of type `__m128i` with all elements set to zero.
__m128i _mm_setzero_si128() pure @trusted
{
    pragma(inline, true);
    // Note: using loadUnaligned has better -O0 codegen compared to .ptr
    int[4] result = [0, 0, 0, 0];
    return cast(__m128i)( loadUnaligned!(int4)(result.ptr) );
}

/// Shuffle 32-bit integers in `a` using the control in `imm8`.
/// See_also: `_MM_SHUFFLE`.
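/// `_MM_SHUFFLE(3, 2, 1, 0)` is the identity permutation. A minimal sketch,
/// broadcasting lane 0 of a hypothetical `v`:
/// ---
/// __m128i splat = _mm_shuffle_epi32!(_MM_SHUFFLE(0, 0, 0, 0))(v);
/// ---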
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pshufd(a, imm8);
    }
    else
    {
        return shufflevector!(int4, (imm8 >> 0) & 3,
                                    (imm8 >> 2) & 3,
                                    (imm8 >> 4) & 3,
                                    (imm8 >> 6) & 3)(a, a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 1, 2, 3);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A);
    int[4] expectedB = [ 3, 2, 1, 0 ];
    assert(B.array == expectedB);
}

/// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`.
/// See_also: `_MM_SHUFFLE2`.
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_shufpd(a, b, imm8);
    }
    else
    {
        return shufflevector!(double2, 0 + ( imm8 & 1 ),
                                       2 + ( (imm8 >> 1) & 1 ))(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0.5, 2.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    enum int SHUFFLE = _MM_SHUFFLE2(1, 1);
    __m128d R = _mm_shuffle_pd!SHUFFLE(A, B);
    double[2] correct = [ 2.0, 5.0 ];
    assert(R.array == correct);
}

/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high
/// 64 bits of result, with the low 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8);
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                            4 + ( (imm8 >> 0) & 3 ),
                                            4 + ( (imm8 >> 2) & 3 ),
                                            4 + ( (imm8 >> 4) & 3 ),
                                            4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A);
    short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ];
    assert(C.array == expectedC);
}

/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64
/// bits of result, with the high 64 bits being copied from `a` to result.
/// See_also: `_MM_SHUFFLE`.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8);
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                    ( (imm8 >> 2) & 3 ),
                                                    ( (imm8 >> 4) & 3 ),
                                                    ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3);
    short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A);
    short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ];
    assert(B.array == expectedB);
}

/// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros.
deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            pslld XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << bits;
        return r;
    }
}

/// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllq XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // ARM: good since LDC 1.12 -O2
        // ~but -O0 version is catastrophic
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..2)
            r.array[i] = cast(ulong)(sa.array[i]) << bits;
        return cast(__m128i)r;
    }
}

/// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
        return cast(int4)r;
    }
}


/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        // D says "It's illegal to shift by the same or more bits
        // than the size of the quantity being shifted"
        // and it's UB instead.
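        // Hence the explicit clamp below: counts of 32 or more return
        // all-zero lanes, matching what the PSLLD instruction does.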
        int4 r = _mm_setzero_si128();

        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            return r;

        foreach(i; 0..4)
            r.array[i] = cast(uint)(a.array[i]) << count;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_slli_epi32(A, 1);
    __m128i B2 = _mm_slli_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 4, 6, -8];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    __m128i C = _mm_slli_epi32(A, 0);
    int[4] expectedC = [ 0, 2, 3, -4];
    assert(C.array == expectedC);

    __m128i D = _mm_slli_epi32(A, 65);
    int[4] expectedD = [ 0, 0, 0, 0];
    assert(D.array == expectedD);
}

/// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
__m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else
    {
        long2 sa = cast(long2)a;

        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        // D says "It's illegal to shift by the same or more bits
        // than the size of the quantity being shifted"
        // and it's UB instead.
        long2 r = cast(long2) _mm_setzero_si128();
        ubyte count = cast(ubyte) imm8;
        if (count > 63)
            return cast(__m128i)r;

        r.ptr[0] = cast(ulong)(sa.array[0]) << count;
        r.ptr[1] = cast(ulong)(sa.array[1]) << count;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_slli_epi64(A, 1);
    long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024);
    long[2] expectedB = [ 16, -8];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    long2 C = cast(long2) _mm_slli_epi64(A, 0);
    long[2] expectedC = [ 8, -4];
    assert(C.array == expectedC);

    long2 D = cast(long2) _mm_slli_epi64(A, 64);
    long[2] expectedD = [ 0, 0];
    assert(D.array == expectedD);
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
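/// As with the other immediate shifts, counts of 16 or more clear every lane,
/// mirroring the hardware behaviour (a bare `<<` by that much would be UB in D).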
__m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_ARM64)
    {
        short8 sa = cast(short8)a;
        short8 r = cast(short8)_mm_setzero_si128();
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m128i)r;
        r = sa << short8(count);
        return cast(__m128i)r;
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 r = cast(short8)_mm_setzero_si128();
        ubyte count = cast(ubyte) imm8;
        if (count > 15)
            return cast(__m128i)r;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(sa.array[i] << count);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_slli_epi16(A, 1) );
    short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) );
    short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short8 C = cast(short8)( _mm_slli_epi16(A, 16) );
    short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}


/// Shift `a` left by `bytes` bytes while shifting in zeros.
__m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted
{
    static if (bytes & 0xF0)
    {
        return _mm_setzero_si128();
    }
    else
    {
        static if (GDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8));
        }
        else version(DigitalMars)
        {
            version(D_InlineAsm_X86)
            {
                asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64
                {
                    movdqu XMM0, op;
                    pslldq XMM0, bytes;
                    movdqu op, XMM0;
                }
                return op;
            }
            else
            {
                byte16 A = cast(byte16)op;
                byte16 R;
                for (int n = 15; n >= bytes; --n)
                    R.ptr[n] = A.array[n-bytes];
                for (int n = bytes-1; n >= 0; --n)
                    R.ptr[n] = 0;
                return cast(__m128i)R;
            }
        }
        else
        {
            return cast(__m128i) shufflevector!(byte16,
                16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes,
                22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes,
                28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes)
                (cast(byte16)_mm_setzero_si128(), cast(byte16)op);
        }
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left
    short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ];
    assert(R.array == correct);

    __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1)); // shifting by the full width must yield zero
    int[4] expectedB = [0, 0, 0, 0];
    assert(B.array == expectedB);
}

/// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`.
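/// A minimal sketch: `_mm_sqrt_pd(_mm_setr_pd(4.0, 9.0))` yields [2.0, 3.0].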
__m128d _mm_sqrt_pd(__m128d vec) pure @trusted
{
    version(LDC)
    {
        // Disappeared with LDC 1.11
        static if (__VERSION__ < 2081)
            return __builtin_ia32_sqrtpd(vec);
        else
        {
            vec.array[0] = llvm_sqrt(vec.array[0]);
            vec.array[1] = llvm_sqrt(vec.array[1]);
            return vec;
        }
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_sqrtpd(vec);
    }
    else
    {
        vec.ptr[0] = sqrt(vec.array[0]);
        vec.ptr[1] = sqrt(vec.array[1]);
        return vec;
    }
}

/// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted
{
    // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only.
    // "128-bit Legacy SSE version: The first source operand and the destination operand are the same.
    // The quadword at bits 127:64 of the destination operand remains unchanged."
    version(LDC)
    {
        // Disappeared with LDC 1.11
        static if (__VERSION__ < 2081)
        {
            __m128d c = __builtin_ia32_sqrtsd(b);
            a[0] = c[0];
            return a;
        }
        else
        {
            a.array[0] = llvm_sqrt(b.array[0]);
            return a;
        }
    }
    else static if (GDC_with_SSE2)
    {
        __m128d c = __builtin_ia32_sqrtsd(b);
        a.ptr[0] = c.array[0];
        return a;
    }
    else
    {
        a.ptr[0] = sqrt(b.array[0]);
        return a;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 3.0);
    __m128d B = _mm_setr_pd(4.0, 5.0);
    __m128d R = _mm_sqrt_sd(A, B);
    double[2] correct = [2.0, 3.0 ];
    assert(R.array == correct);
}

/// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits.
deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count);
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(sa.array[i] >> bits);
        return cast(int4)r;
    }
}

/// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits.
deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_psrad128(a, count);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_psrad128(a, count);
    }
    else
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        r.ptr[0] = (a.array[0] >> bits);
        r.ptr[1] = (a.array[1] >> bits);
        r.ptr[2] = (a.array[2] >> bits);
        r.ptr[3] = (a.array[3] >> bits);
        return r;
    }
}


/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
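/// The shift is arithmetic, so the sign is preserved: shifting -4 right by 1
/// gives -2 in each lane, e.g. `_mm_srai_epi16(_mm_set1_epi16(-4), 1)`.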
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits.
__m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_ARM64)
    {
        short8 sa = cast(short8)a;
        ubyte count = cast(ubyte)imm8;
        if (count > 15)
            count = 15;
        short8 r = sa >> short8(count);
        return cast(__m128i)r;
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 r = void;

        // Note: the intrinsic guarantees that only imm8[0..7] is taken, however
        // D says "It's illegal to shift by the same or more bits
        // than the size of the quantity being shifted" and it's UB instead.
        // Hence the clamp.
        ubyte count = cast(ubyte)imm8;
        if (count > 15)
            count = 15;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(sa.array[i] >> count);
        return cast(int4)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srai_epi16(A, 1) );
    short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) );
    short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short8 C = cast(short8)( _mm_srai_epi16(A, 18) );
    short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ];
    assert(C.array == expectedC);
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits.
__m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_psradi128(a, cast(ubyte)imm8);
    }
    else
    {
        int4 r = void;

        // Note: the intrinsic guarantees that only imm8[0..7] is taken, however
        // D says "It's illegal to shift by the same or more bits
        // than the size of the quantity being shifted" and it's UB instead.
        // Hence the clamp.
        ubyte count = cast(ubyte) imm8;
        if (count > 31)
            count = 31;

        r.ptr[0] = (a.array[0] >> count);
        r.ptr[1] = (a.array[1] >> count);
        r.ptr[2] = (a.array[2] >> count);
        r.ptr[3] = (a.array[3] >> count);
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srai_epi32(A, 1);
    __m128i B2 = _mm_srai_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 1, 1, -2];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    __m128i C = _mm_srai_epi32(A, 32);
    int[4] expectedC = [ 0, 0, 0, -1];
    assert(C.array == expectedC);

    __m128i D = _mm_srai_epi32(A, 0);
    int[4] expectedD = [ 0, 2, 3, -4];
    assert(D.array == expectedD);
}

/// Shift packed 16-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count);
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits);
        return cast(int4)r;
    }
}

/// Shift packed 32-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_psrld128(a, count);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_psrld128(a, count);
    }
    else
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        r.ptr[0] = cast(uint)(a.array[0]) >> bits;
        r.ptr[1] = cast(uint)(a.array[1]) >> bits;
        r.ptr[2] = cast(uint)(a.array[2]) >> bits;
        r.ptr[3] = cast(uint)(a.array[3]) >> bits;
        return r;
    }
}

/// Shift packed 64-bit integers in `a` right by `count` while shifting in zeros.
deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count);
    }
    else
    {
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        r.ptr[0] = cast(ulong)(sa.array[0]) >> bits;
        r.ptr[1] = cast(ulong)(sa.array[1]) >> bits;
        return cast(__m128i)r;
    }
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros.
__m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_ARM64)
    {
        short8 sa = cast(short8)a;
        short8 r = cast(short8) _mm_setzero_si128();

        ubyte count = cast(ubyte)imm8;
        if (count >= 16)
            return cast(__m128i)r;

        r = sa >>> short8(count); // the vector >>> operator is available with LDC, but not with DMD
        return cast(__m128i)r;
    }
    else
    {
        short8 sa = cast(short8)a;
        ubyte count = cast(ubyte)imm8;

        short8 r = cast(short8) _mm_setzero_si128();
        if (count >= 16)
            return cast(__m128i)r;

        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> count);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7);
    short8 B = cast(short8)( _mm_srli_epi16(A, 1) );
    short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) );
    short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    short8 C = cast(short8)( _mm_srli_epi16(A, 16) );
    short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ];
    assert(C.array == expectedC);

    short8 D = cast(short8)( _mm_srli_epi16(A, 0) );
    short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ];
    assert(D.array == expectedD);
}


/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros.
__m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_psrldi128(a, cast(ubyte)imm8);
    }
    else
    {
        ubyte count = cast(ubyte) imm8;

        // Note: the intrinsic guarantees that only imm8[0..7] is taken, however
        // D says "It's illegal to shift by the same or more bits
        // than the size of the quantity being shifted" and it's UB instead.
        // Hence the early return.
        int4 r = _mm_setzero_si128();
        if (count >= 32)
            return r;
        r.ptr[0] = a.array[0] >>> count;
        r.ptr[1] = a.array[1] >>> count;
        r.ptr[2] = a.array[2] >>> count;
        r.ptr[3] = a.array[3] >>> count;
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, 2, 3, -4);
    __m128i B = _mm_srli_epi32(A, 1);
    __m128i B2 = _mm_srli_epi32(A, 1 + 256);
    int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    __m128i C = _mm_srli_epi32(A, 255);
    int[4] expectedC = [ 0, 0, 0, 0 ];
    assert(C.array == expectedC);
}

/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros.
__m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8);
    }
    else
    {
        long2 r = cast(long2) _mm_setzero_si128();
        long2 sa = cast(long2)a;

        ubyte count = cast(ubyte) imm8;
        if (count >= 64)
            return cast(__m128i)r;

        r.ptr[0] = sa.array[0] >>> count;
        r.ptr[1] = sa.array[1] >>> count;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(8, -4);
    long2 B = cast(long2) _mm_srli_epi64(A, 1);
    long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512);
    long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE];
    assert(B.array == expectedB);
    assert(B2.array == expectedB);

    long2 C = cast(long2) _mm_srli_epi64(A, 64);
    long[2] expectedC = [ 0, 0 ];
    assert(C.array == expectedC);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    static if (bytes & 0xF0)
    {
        return _mm_setzero_si128();
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, v;
            psrldq XMM0, bytes;
            movdqu v, XMM0;
        }
        return v;
    }
    else
    {
        return cast(__m128i) shufflevector!(byte16,
            bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
            bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
            (cast(byte16) v, cast(byte16)_mm_setzero_si128());
    }
}
unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] correct = [2, 3, 4, 0];
    assert(R.array == correct);

    __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
    int[4] expectedA = [0, 0, 0, 0];
    assert(A.array == expectedA);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
}
unittest
{
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
{
    pragma(inline, true);
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    __m128d r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[0];
    *aligned = r;
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to
/// be aligned on any particular boundary.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[0];
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a
/// general-protection exception may be generated.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1; ///
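// Minimal sanity unittests for `_mm_srli_pd` and the aligned `_mm_store*` functions above,
// which had none; they only exercise the basic contract stated in the documentation.
unittest
{
    __m128d R = _mm_srli_pd!8(_mm_setr_pd(1.0, 2.0)); // shift out the lower double
    double[2] correct = [2.0, 0.0];
    assert(R.array == correct);
}
unittest
{
    align(16) double[2] buf;
    __m128d A = _mm_setr_pd(1.5, -2.0);
    _mm_store_pd(buf.ptr, A);
    double[2] correct = [1.5, -2.0];
    assert(buf == correct);

    _mm_store_pd1(buf.ptr, A); // lower element, broadcast to both slots
    double[2] correct2 = [1.5, 1.5];
    assert(buf == correct2);

    _mm_store_sd(buf.ptr, _mm_setr_pd(4.0, 5.0)); // only the lower element is written
    assert(buf[0] == 4.0);
}
unittest
{
    align(16) int[4] buf;
    _mm_store_si128(cast(__m128i*)(buf.ptr), _mm_setr_epi32(1, 2, 3, 4));
    int[4] correct = [1, 2, 3, 4];
    assert(buf == correct);
}
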
/// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[1];
}

/// Store 64-bit integer from the first element of `a` into memory.
// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    long* dest = cast(long*)mem_addr;
    long2 la = cast(long2)a;
    *dest = la.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[0];
}

/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be
/// aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    storeUnaligned!double2(a, mem_addr);
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular
/// boundary.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}

/// Store 32-bit integer from the first element of `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted
{
    pragma(inline, true);
    int* dest = cast(int*)mem_addr;
    *dest = a.array[0];
}
unittest
{
    int[2] arr = [-24, 12];
    _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
    assert(arr == [-24, -1]);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
void _mm_stream_pd (double* mem_addr, __m128d a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a;
}

/// Store 128-bits of integer data from `a` into memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    __m128i* dest = cast(__m128i*)mem_addr;
    *dest = a;
}

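// Minimal sanity unittest for `_mm_storeh_pd`, `_mm_storel_pd`, `_mm_storer_pd` and
// `_mm_storeu_pd` above, which had none; it exercises each store flavour once.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    double[2] arr;
    _mm_storeh_pd(&arr[0], A); // upper element first
    _mm_storel_pd(&arr[1], A); // then lower element
    double[2] correct = [2.0, 1.0];
    assert(arr == correct);

    align(16) double[2] reversed;
    _mm_storer_pd(reversed.ptr, A); // stores in reverse order
    assert(reversed == correct);

    double[2] unaligned;
    _mm_storeu_pd(unaligned.ptr, A);
    double[2] correct2 = [1.0, 2.0];
    assert(unaligned == correct2);
}
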
/// Store 32-bit integer `a` into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address `mem_addr` is already in the cache,
/// the cache will be updated.
void _mm_stream_si32 (int* mem_addr, int a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Store 64-bit integer `a` into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address `mem_addr` is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit)
/// floating-point elements in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a - b;
}

/// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit)
/// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
/// upper element of result.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_subsd(a, b);
    }
    else
    {
        a.ptr[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}

/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    pragma(inline, true);
    return a - b;
}

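// Minimal sanity unittests for the `_mm_sub_epi*` and `_mm_sub_pd` intrinsics above,
// which had none; plain wrap-around subtraction, no saturation involved.
unittest
{
    __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 R = cast(short8) _mm_sub_epi16(A, _mm_set1_epi16(10));
    short[8] correct = [-6, -2, 3, -17, -11, -10, -1, 67];
    assert(R.array == correct);
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_pd(a, a);
    assert(a.array == [0.0, 0.0]);
}
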
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using signed saturation.
__m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PSUBSW since LDC 1.15 -O0
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(  -10,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using signed saturation.
__m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBSB since LDC 1.15 -O0
            // ARM: Generates sqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a` using unsigned saturation.
__m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSW since LDC 1.15 -O0
            // ARM: Generates uqsub.8h since LDC 1.21 -O0
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                int diff = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(diff);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            int diff = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            res[i] = saturateSignedIntToUnsignedShort(diff);
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a` using unsigned saturation.
__m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSB since LDC 1.15 -O0
            // ARM: Generates uqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubusb128(cast(byte16)a, cast(byte16)b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Note: the only difference between the ucomi and comi intrinsics is their signalling
// behaviour on quiet NaNs, so aliasing them is slightly incorrect; but the case where you
// would want to differentiate between qNaN and sNaN, and then treat them differently on
// purpose, seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd; ///
alias _mm_ucomige_sd = _mm_comige_sd; ///
alias _mm_ucomigt_sd = _mm_comigt_sd; ///
alias _mm_ucomile_sd = _mm_comile_sd; ///
alias _mm_ucomilt_sd = _mm_comilt_sd; ///
alias _mm_ucomineq_sd = _mm_comineq_sd; ///

/// Return vector of type `__m128d` with undefined elements.
__m128d _mm_undefined_pd() pure @safe
{
    pragma(inline, true);
    __m128d result = void;
    return result;
}

/// Return vector of type `__m128i` with undefined elements.
__m128i _mm_undefined_si128() pure @safe
{
    pragma(inline, true);
    __m128i result = void;
    return result;
}

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckhdq128(a, b);
    }
    else version(DigitalMars)
    {
        __m128i r;
        r.ptr[0] = a.array[2];
        r.ptr[1] = b.array[2];
        r.ptr[2] = a.array[3];
        r.ptr[3] = b.array[3];
        return r;
    }
    else
    {
        return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    __m128i C = _mm_unpackhi_epi32(A, B);
    int[4] correct = [3, 7, 4, 8];
    assert(C.array == correct);
}

/// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
    }
    else
    {
        __m128i r = cast(__m128i)b;
        r.ptr[0] = a.array[2];
        r.ptr[1] = a.array[3];
        return r;
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpackhi_epi64(A, B));
    long[2] correct = [0x33333333_33333333, 0x55555555_55555555];
    assert(C.array == correct);
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i)shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27,
                                           12, 28, 13, 29, 14, 30, 15, 31)
                                          (cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B);
    byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31];
    assert(C.array == correct);
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`.
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpckhpd(a, b);
    }
    else
    {
        return shufflevector!(__m128d, 1, 3)(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 6.0);
    __m128d B = _mm_setr_pd(7.0, 9.0);
    __m128d C = _mm_unpackhi_pd(A, B);
    double[2] correct = [6.0, 9.0];
    assert(C.array == correct);
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short8 C = cast(short8) _mm_unpacklo_epi16(A, B);
    short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11];
    assert(C.array == correct);
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_punpckldq128(a, b);
    }
    else version(DigitalMars)
    {
        __m128i r;
        r.ptr[0] = a.array[0];
        r.ptr[1] = b.array[0];
        r.ptr[2] = a.array[1];
        r.ptr[3] = b.array[1];
        return r;
    }
    else
    {
        return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    __m128i B = _mm_setr_epi32(5, 6, 7, 8);
    __m128i C = _mm_unpacklo_epi32(A, B);
    int[4] correct = [1, 5, 2, 6];
    assert(C.array == correct);
}

/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
    }
    else
    {
        long2 lA = cast(long2)a;
        long2 lB = cast(long2)b;
        long2 R;
        R.ptr[0] = lA.array[0];
        R.ptr[1] = lB.array[0];
        return cast(__m128i)R;
    }
}
unittest // Issue #36
{
    __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333);
    __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555);
    long2 C = cast(long2)(_mm_unpacklo_epi64(A, B));
    long[2] correct = [0x22222222_22222222, 0x44444444_44444444];
    assert(C.array == correct);
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpcklbw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19,
                                            4, 20, 5, 21, 6, 22, 7, 23)
                                           (cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B);
    byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23];
    assert(C.array == correct);
}

/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`.
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_unpcklpd(a, b);
    }
    else
    {
        return shufflevector!(__m128d, 0, 2)(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, 6.0);
    __m128d B = _mm_setr_pd(7.0, 9.0);
    __m128d C = _mm_unpacklo_pd(A, B);
    double[2] correct = [4.0, 7.0];
    assert(C.array == correct);
}

/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b);
}
// TODO: force inline (see unittest at the end of this module)

/// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    return a ^ b;
}
// TODO: force inline (see unittest at the end of this module)

unittest
{
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        __m128 va = _mm_loadu_ps(a.ptr);
        __m128 vb = _mm_loadu_ps(b.ptr);
        __m128 diffSquared = _mm_sub_ps(va, vb);
        diffSquared = _mm_mul_ps(diffSquared, diffSquared);
        __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared));
        sum = _mm_add_ps(sum, _mm_srli_ps!4(sum));
        return _mm_cvtss_f32(_mm_sqrt_ss(sum));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}
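
// Minimal sanity unittests for `_mm_xor_pd` and `_mm_xor_si128` above, which had none;
// the double-precision case checks the classic sign-flip-by-XOR trick.
unittest
{
    __m128d A = _mm_setr_pd(4.0, -2.0);
    __m128d signMask = _mm_set1_pd(-0.0); // only the sign bit set
    __m128d R = _mm_xor_pd(A, signMask);  // XOR with the sign bit negates each lane
    double[2] correct = [-4.0, 2.0];
    assert(R.array == correct);
}
unittest
{
    __m128i A = _mm_setr_epi32(5, -1,  0, 0x7FFF_FFFF);
    __m128i B = _mm_setr_epi32(3, -1, -1, 1);
    __m128i R = _mm_xor_si128(A, B);
    int[4] correct = [6, 0, -1, 0x7FFF_FFFE];
    assert(R.array == correct);
}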