/**
 * SSE2 intrinsics.
 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
 *
 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1
import inteli.mmx;
import inteli.internals;

nothrow @nogc:


// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(short8)a + cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 R = cast(short8) _mm_add_epi16(A, A);
    short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(int4)a + cast(int4)b);
}
unittest
{
    __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
    int4 R = _mm_add_epi32(A, A);
    int[4] correct = [ -14, -2, 0, 18 ];
    assert(R.array == correct);
}

/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(long2)a + cast(long2)b);
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 R = cast(long2) _mm_add_epi64(A, A);
    long[2] correct = [ -2, 0 ];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(byte16)a + cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 R = cast(byte16) _mm_add_epi8(A, A);
    byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(R.array == correct);
}

/// Add the lower double-precision (64-bit) floating-point element
/// in `a` and `b`, store the result in the lower element of dst,
/// and copy the upper element from `a` to the upper element of destination.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}

/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    pragma(inline, true);
    return a + b;
}

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSW since LDC 1.15 -O0
            // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
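
// Supplementary example for `_mm_add_si64` above, which has no unittest of its own.
// An illustrative check only: it assumes `__m64` is the `long1` vector type from inteli.types,
// and uses the internal helper `to_m64` (inteli.internals, already imported) to build `__m64` values.
unittest
{
    __m64 A = to_m64(_mm_setr_epi32(-20, -1, 0, 0)); // low 64 bits hold -20
    __m64 B = to_m64(_mm_setr_epi32(  4,  0, 0, 0)); // low 64 bits hold 4
    __m64 R = _mm_add_si64(A, B);
    assert(R.array[0] == -16);
}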

/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSB since LDC 1.15 -O0
            // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}

/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSB since LDC 1.15 -O0
            // ARM: Generates uqadd.16b since LDC 1.21 -O1
            enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusb128(a, b);
    }
    else
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16)
        _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                      _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14,
                                               0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSW since LDC 1.15 -O0
            // ARM: Generates uqadd.8h since LDC 1.21 -O1
            enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ushort[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusw128(a, b);
    }
    else
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return cast(__m128d)( cast(long2)a & cast(long2)b );
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m128d A = _mm_set_pd(a, b);
    __m128d B = _mm_set_pd(b, a);
    long2 R = cast(long2)( _mm_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return a & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_and_si128(A, B);
    int[4] correct = [6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of packed double-precision (64-bit)
/// floating-point elements in `a` and then AND with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
    long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
    __m128d A = _mm_setr_pd(a, b);
    __m128d B = _mm_setr_pd(b, a);
    long2 R = cast(long2)( _mm_andnot_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct2);
}

/// Compute the bitwise NOT of 128 bits (representing integer data)
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_andnot_si128(A, B);
    int[4] correct = [8, 8, 8, 8];
    assert(R.array == correct);
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48);
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
    }
    else version(LDC)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48);
}

/// Shift `a` left by `bytes` bytes while shifting in zeros.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Cast vector of type `__m128d` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Cast vector of type `__m128d` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128i` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}
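
// Supplementary check for the cast intrinsics above: they reinterpret bits without
// any conversion, so an IEEE-754 float bit pattern is visible unchanged through
// `_mm_castps_si128`. Illustrative only; values below are chosen for their known bit patterns.
unittest
{
    __m128 F = _mm_setr_ps(1.0f, -0.0f, 0.0f, 2.0f);
    int4 I = cast(int4) _mm_castps_si128(F);
    assert(I.array[0] == 0x3F800000);          // bit pattern of 1.0f
    assert(I.array[1] == cast(int)0x80000000); // bit pattern of -0.0f
}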

/// Cast vector of type `__m128i` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}

/// Invalidate and flush the cache line that contains `p`
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else
    {
        // Do nothing. Invalidating cacheline does
        // not affect correctness.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0];
    short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4 A = [-3, -2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, -1];
    int4 R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}
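
// Supplementary check for `_mm_cmpeq_pd` above: per the Intel Intrinsics Guide,
// equal elements yield an all-ones mask and unequal elements yield zero.
// Illustrative only; the result is inspected as raw 64-bit lanes.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    long2 R = cast(long2) _mm_cmpeq_pd(A, B);
    assert(R.array[0] == -1); // equal => all bits set
    assert(R.array[1] == 0);  // not equal => all bits clear
}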

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1];
    short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b));
    }
}
unittest
{
    int4 A = [-3, 2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, 0];
    int4 R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else
    {
        return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct = [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    __m128i D = _mm_cmpeq_epi8(A, B);
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}
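
// Supplementary check for `_mm_cmpgt_pd` above: greater-than is an ordered comparison,
// so any lane involving NaN compares false. Illustrative only.
unittest
{
    __m128d A = _mm_setr_pd(2.0, 1.0);
    __m128d B = _mm_setr_pd(1.0, double.nan);
    long2 R = cast(long2) _mm_cmpgt_pd(A, B);
    assert(R.array[0] == -1); // 2.0 > 1.0
    assert(R.array[1] == 0);  // comparison with NaN is false
}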

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed 32-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}
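
// Supplementary check for `_mm_cmpneq_pd` above: not-equal is an unordered comparison,
// so a lane involving NaN compares true. Illustrative only.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, double.nan);
    long2 R = cast(long2) _mm_cmpneq_pd(A, B);
    assert(R.array[0] == 0);  // 1.0 == 1.0, so "not-equal" is false
    assert(R.array[1] == -1); // NaN lane is unordered => "not-equal" is true
}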

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal, store the result in
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}
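
// Supplementary check for `_mm_cmpnlt_pd` above: "not-less-than" is the negation of an
// ordered less-than, so an unordered (NaN) lane compares true. Illustrative only.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(2.0, 2.0);
    long2 R = cast(long2) _mm_cmpnlt_pd(A, B);
    assert(R.array[0] == 0);  // 1.0 < 2.0, so "not-less-than" is false
    assert(R.array[1] == -1); // unordered => "not-less-than" is true
}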

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN, store the result in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN, store the result in the lower
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    // Note: For some of the _mm_comixx_sx intrinsics, the NaN semantics differ from those of
    // the comisd instruction: the intrinsic returns false in the unordered case instead.
    //
    // Actually C++ compilers disagree over the meaning of that instruction.
    // GCC will manage NaNs like the comisd instruction (return true if unordered),
    // but ICC, clang and MSVC will deal with NaN like the Intel Intrinsics Guide says.
    // We choose to follow the majority. It seems GCC is buggy with NaNs.
    return a.array[0] == b.array[0];
}
unittest
{
    assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}
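
// Supplementary check for `_mm_cmpord_pd` and `_mm_cmpunord_pd` above: the two masks are
// complementary, with the unordered mask set exactly where a NaN is involved. Illustrative only.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(2.0, 3.0);
    long2 ord   = cast(long2) _mm_cmpord_pd(A, B);
    long2 unord = cast(long2) _mm_cmpunord_pd(A, B);
    assert(ord.array[0] == -1 && ord.array[1] == 0);
    assert(unord.array[0] == 0 && unord.array[1] == -1);
}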

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than-or-equal, and return the boolean
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] >= b.array[0];
}
unittest
{
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] > b.array[0];
}
unittest
{
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] <= b.array[0];
}
unittest
{
    assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] < b.array[0];
}
unittest
{
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] != b.array[0];
}
unittest
{
    assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else version(LDC)
    {
        // See #86 for why we had to resort to LLVM IR.
        // Plain code below was leading to catastrophic behaviour.
        // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x float>
            ret <4 x float> %r`;
        return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
    }
    else
    {
        __m128 res;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    // PERF ARM32
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        long2 i;
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
            case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
            case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
        }
        int4 zero = 0;
        return cast(__m128i) shufflevector!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
    }
    else
    {
        // PERF ARM32
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    {
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_cvtps2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    // GDC bug #98607
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide an optimization barrier for rounding mode.
    // Worked around with different literals. This bug will likely only manifest in unittest.
    // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}
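
// Supplementary check for `_mm_cvtsd_f64` defined above, which has no unittest:
// it simply returns the lower lane. Illustrative only.
unittest
{
    assert(_mm_cvtsd_f64(_mm_setr_pd(-2.5, 4.0)) == -2.5);
}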

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtsd_si64 (__m128d a) @trusted
{
    version (LDC)
    {
        version (X86_64)
        {
            return __builtin_ia32_cvtsd2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
    else
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///

/// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit)
/// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b);
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a.ptr[0] = b.array[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}

/// Get the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la.array[0];
}
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;

/// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 r = [0, 0, 0, 0];
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}
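
// Supplementary check for `_mm_cvtsi128_si32` and `_mm_cvtsi128_si64` defined above (neither has
// a unittest): they extract the low 32-bit and low 64-bit lanes respectively. Illustrative only;
// the 64-bit expectation below is the little-endian combination of the first two 32-bit lanes.
unittest
{
    __m128i A = _mm_setr_epi32(-1, 2, 3, 4);
    assert(_mm_cvtsi128_si32(A) == -1);
    assert(_mm_cvtsi128_si64(A) == 0x0000_0002_FFFF_FFFF);
}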

/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}

deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///

/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit)
/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper
/// element of result.
double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
/// Put zeroes in the upper elements of result.
__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: generates fcvtze since LDC 1.8 -O2
    __m128i r;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}
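
// Supplementary check for `_mm_cvtsi64_si128` defined above, which has no unittest:
// the 64-bit value lands in the low lane and the high lane is zeroed. Illustrative only.
unittest
{
    long2 R = cast(long2) _mm_cvtsi64_si128(-42);
    assert(R.array[0] == -42);
    assert(R.array[1] == 0);
}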

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resorts to the FPU
    return cast(long)a.array[0];
}

deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///

/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a / b;
}

/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`,
/// store the result in the lower element, and copy the upper element from `a`.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_divsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        a.ptr[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
    assert(_mm_extract_epi16(A, 5 + 8) == 5);
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

/// Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction.
void _mm_lfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_lfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
    else version(LDC)
    {
        llvm_memory_fence(); // PERF actually generates mfence
    }
    else
        static assert(false);
}
unittest
{
    _mm_lfence();
}
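
// Supplementary check for `_mm_cvttsd_si32` defined earlier in this group, which has no unittest:
// the conversion truncates toward zero, independently of the current rounding mode. Illustrative only.
unittest
{
    assert(_mm_cvttsd_si32(_mm_set1_pd(-4.9)) == -4);
    assert(_mm_cvttsd_si32(_mm_set1_pd( 4.9)) ==  4);
}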

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    pragma(inline, true);
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}
unittest
{
    align(16) double[2] S = [-5.0, 7.0];
    __m128d R = _mm_load_pd(S.ptr);
    assert(R.array == S);
}

/// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double m = *mem_addr;
    __m128d r;
    r.ptr[0] = m;
    r.ptr[1] = m;
    return r;
}
unittest
{
    double what = 4;
    __m128d R = _mm_load_pd1(&what);
    double[2] correct = [4.0, 4];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper
/// element. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Load 128-bits of integer data from memory into dst.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be trusted because alignment, Issue #62
{
    pragma(inline, true);
    return *mem_addr;
}
unittest
{
    align(16) int[4] correct = [-1, 2, 3, 4];
    int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
    assert(A.array == correct);
}

alias _mm_load1_pd = _mm_load_pd1; ///

/// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the
/// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    pragma(inline, true);
    a.ptr[1] = *mem_addr;
    return a;
}
unittest
{
    double A = 7.0;
    __m128d B = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadh_pd(B, &A);
    double[2] correct = [ 4.0, 7.0 ];
    assert(R.array == correct);
}

/// Load 64-bit integer from memory into the first element of result. Zero out the other.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60)
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
{
    pragma(inline, true);
    auto pLong = cast(const(long)*)mem_addr;
    long2 r = [0, 0];
    r.ptr[0] = *pLong;
    return cast(__m128i)(r);
}
unittest
{
    long A = 0x7878787870707070;
    long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
    long[2] correct = [0x7878787870707070, 0];
    assert(R.array == correct);
}
1908 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted 1909 { 1910 a.ptr[0] = *mem_addr; 1911 return a; 1912 } 1913 unittest 1914 { 1915 double A = 7.0; 1916 __m128d B = _mm_setr_pd(4.0, -5.0); 1917 __m128d R = _mm_loadl_pd(B, &A); 1918 double[2] correct = [ 7.0, -5.0 ]; 1919 assert(R.array == correct); 1920 } 1921 1922 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 1923 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1924 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted 1925 { 1926 __m128d a = *cast(__m128d*)(mem_addr); 1927 __m128d r; 1928 r.ptr[0] = a.array[1]; 1929 r.ptr[1] = a.array[0]; 1930 return r; 1931 } 1932 unittest 1933 { 1934 align(16) double[2] A = [56.0, -74.0]; 1935 __m128d R = _mm_loadr_pd(A.ptr); 1936 double[2] correct = [-74.0, 56.0]; 1937 assert(R.array == correct); 1938 } 1939 1940 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 1941 /// `mem_addr` does not need to be aligned on any particular boundary. 1942 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted 1943 { 1944 pragma(inline, true); 1945 static if (GDC_with_SSE2) 1946 { 1947 return __builtin_ia32_loadupd(mem_addr); 1948 } 1949 else version(LDC) 1950 { 1951 return loadUnaligned!(double2)(mem_addr); 1952 } 1953 else version(DigitalMars) 1954 { 1955 static if (DMD_with_DSIMD) 1956 { 1957 return cast(__m128d)__simd(XMM.LODUPD, *mem_addr); 1958 } 1959 else static if (SSESizedVectorsAreEmulated) 1960 { 1961 // Since this vector is emulated, it doesn't have alignement constraints 1962 // and as such we can just cast it. 1963 return *cast(__m128d*)(mem_addr); 1964 } 1965 else 1966 { 1967 __m128d result; 1968 result.ptr[0] = mem_addr[0]; 1969 result.ptr[1] = mem_addr[1]; 1970 return result; 1971 } 1972 } 1973 else 1974 { 1975 __m128d result; 1976 result.ptr[0] = mem_addr[0]; 1977 result.ptr[1] = mem_addr[1]; 1978 return result; 1979 } 1980 } 1981 unittest 1982 { 1983 double[2] A = [56.0, -75.0]; 1984 __m128d R = _mm_loadu_pd(A.ptr); 1985 double[2] correct = [56.0, -75.0]; 1986 assert(R.array == correct); 1987 } 1988 1989 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary. 1990 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted 1991 { 1992 pragma(inline, true); 1993 static if (GDC_with_SSE2) 1994 { 1995 return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr); 1996 } 1997 else 1998 { 1999 return loadUnaligned!(__m128i)(cast(int*)mem_addr); 2000 } 2001 } 2002 unittest 2003 { 2004 align(16) int[4] correct = [-1, 2, -3, 4]; 2005 int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr); 2006 assert(A.array == correct); 2007 } 2008 2009 /// Load unaligned 32-bit integer from memory into the first element of result. 2010 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted 2011 { 2012 pragma(inline, true); 2013 int r = *cast(int*)(mem_addr); 2014 int4 result = [0, 0, 0, 0]; 2015 result.ptr[0] = r; 2016 return result; 2017 } 2018 unittest 2019 { 2020 int r = 42; 2021 __m128i A = _mm_loadu_si32(&r); 2022 int[4] correct = [42, 0, 0, 0]; 2023 assert(A.array == correct); 2024 } 2025 2026 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 2027 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 2028 /// and pack the results in destination. 
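/// Example: a worked sketch of the pairwise multiply-add (values chosen for illustration).
/// ---
/// __m128i x = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
/// __m128i y = _mm_set1_epi16(10);
/// // adjacent pairs: 1*10+2*10, 3*10+4*10, 5*10+6*10, 7*10+8*10
/// __m128i sums = _mm_madd_epi16(x, y); // as int4: [30, 70, 110, 150]
/// ---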
2029 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted 2030 { 2031 static if (GDC_with_SSE2) 2032 { 2033 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2034 } 2035 else static if (LDC_with_SSE2) 2036 { 2037 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2038 } 2039 else static if (LDC_with_ARM64) 2040 { 2041 int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b)); 2042 int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b)); 2043 int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); 2044 int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); 2045 return vcombine_s32(rl, rh); 2046 } 2047 else 2048 { 2049 short8 sa = cast(short8)a; 2050 short8 sb = cast(short8)b; 2051 int4 r; 2052 foreach(i; 0..4) 2053 { 2054 r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1]; 2055 } 2056 return r; 2057 } 2058 } 2059 unittest 2060 { 2061 short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2062 short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2063 int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B); 2064 int[4] correct = [1, 13, -2147483648, 2*32767*32767]; 2065 assert(R.array == correct); 2066 } 2067 2068 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 2069 /// (elements are not stored when the highest bit is not set in the corresponding element) 2070 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 2071 /// boundary. 2072 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted 2073 { 2074 static if (GDC_with_SSE2) 2075 { 2076 return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr); 2077 } 2078 else static if (LDC_with_SSE2) 2079 { 2080 return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr); 2081 } 2082 else static if (LDC_with_ARM64) 2083 { 2084 // PERF: catastrophic on ARM32 2085 byte16 bmask = cast(byte16)mask; 2086 byte16 shift = 7; 2087 bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask 2088 mask = cast(__m128i) bmask; 2089 __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr); 2090 dest = (a & mask) | (dest & ~mask); 2091 storeUnaligned!__m128i(dest, cast(int*)mem_addr); 2092 } 2093 else 2094 { 2095 byte16 b = cast(byte16)a; 2096 byte16 m = cast(byte16)mask; 2097 byte* dest = cast(byte*)(mem_addr); 2098 foreach(j; 0..16) 2099 { 2100 if (m.array[j] & 128) 2101 { 2102 dest[j] = b.array[j]; 2103 } 2104 } 2105 } 2106 } 2107 unittest 2108 { 2109 ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]; 2110 __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0); 2111 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15); 2112 _mm_maskmoveu_si128(A, mask, dest.ptr); 2113 ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42]; 2114 assert(dest == correct); 2115 } 2116 2117 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values. 
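/// Example: a minimal usage sketch (values chosen for illustration).
/// ---
/// __m128i m = _mm_max_epi16(_mm_set1_epi16(-5), _mm_set1_epi16(3)); // every lane becomes 3
/// ---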
2118 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe 2119 { 2120 static if (GDC_with_SSE2) 2121 { 2122 return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b); 2123 } 2124 else version(LDC) 2125 { 2126 // x86: pmaxsw since LDC 1.0 -O1 2127 // ARM: smax.8h since LDC 1.5 -01 2128 short8 sa = cast(short8)a; 2129 short8 sb = cast(short8)b; 2130 short8 greater = greaterMask!short8(sa, sb); 2131 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2132 } 2133 else 2134 { 2135 __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else 2136 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2137 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2138 return _mm_xor_si128(b, mask); 2139 } 2140 } 2141 unittest 2142 { 2143 short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57), 2144 _mm_setr_epi16(-4,-8, 9, 7, 0,-32768, 0, 0)); 2145 short[8] correct = [32767, 1, 9, 7, 9, 7, 0, 0]; 2146 assert(R.array == correct); 2147 } 2148 2149 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values. 2150 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe 2151 { 2152 version(LDC) 2153 { 2154 // x86: pmaxub since LDC 1.0.0 -O1 2155 // ARM64: umax.16b since LDC 1.5.0 -O1 2156 // PERF: catastrophic on ARM32 2157 ubyte16 sa = cast(ubyte16)a; 2158 ubyte16 sb = cast(ubyte16)b; 2159 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2160 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2161 } 2162 else 2163 { 2164 __m128i value128 = _mm_set1_epi8(-128); 2165 __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2166 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2167 __m128i mask = _mm_and_si128(aTob, higher); 2168 return _mm_xor_si128(b, mask); 2169 } 2170 } 2171 unittest 2172 { 2173 byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2174 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2175 byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 2176 assert(R.array == correct); 2177 } 2178 2179 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values. 2180 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted 2181 { 2182 static if (GDC_with_SSE2) 2183 { 2184 return __builtin_ia32_maxpd(a, b); 2185 } 2186 else 2187 { 2188 // x86: Generates maxpd starting with LDC 1.9 -O2 2189 a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; 2190 a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1]; 2191 return a; 2192 } 2193 } 2194 unittest 2195 { 2196 __m128d A = _mm_setr_pd(4.0, 1.0); 2197 __m128d B = _mm_setr_pd(1.0, 8.0); 2198 __m128d M = _mm_max_pd(A, B); 2199 assert(M.array[0] == 4.0); 2200 assert(M.array[1] == 8.0); 2201 } 2202 2203 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 2204 /// lower element of result, and copy the upper element from `a` to the upper element of result. 2205 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted 2206 { 2207 static if (GDC_with_SSE2) 2208 { 2209 return __builtin_ia32_maxsd(a, b); 2210 } 2211 else 2212 { 2213 __m128d r = a; 2214 // Generates maxsd starting with LDC 1.3 2215 r.ptr[0] = (a.array[0] > b.array[0]) ? 
                     a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 1.0);
}

/// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to
/// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction
/// is globally visible before any memory instruction which follows the fence in program order.
void _mm_mfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_mfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
    else version(LDC)
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_mfence();
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 greater = greaterMask!short8(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lowerShorts);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-32768),
                                          _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0));
    short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -32768];
    assert(R.array == correct);
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
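/// Example: a minimal sketch; the comparison is unsigned, so -1 (0xFF) counts as the largest byte value.
/// ---
/// __m128i m = _mm_min_epu8(_mm_set1_epi8(-1), _mm_set1_epi8(7)); // every byte becomes 7
/// ---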
2309 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe 2310 { 2311 version(LDC) 2312 { 2313 // x86: pminub since LDC 1.0.0 -O1 2314 // ARM: umin.16b since LDC 1.5.0 -O1 2315 // PERF: catastrophic on ARM32 2316 ubyte16 sa = cast(ubyte16)a; 2317 ubyte16 sb = cast(ubyte16)b; 2318 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2319 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2320 } 2321 else 2322 { 2323 __m128i value128 = _mm_set1_epi8(-128); 2324 __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2325 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2326 __m128i mask = _mm_and_si128(aTob, lower); 2327 return _mm_xor_si128(b, mask); 2328 } 2329 } 2330 unittest 2331 { 2332 byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2333 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2334 byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; 2335 assert(R.array == correct); 2336 } 2337 2338 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values. 2339 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted 2340 { 2341 static if (GDC_with_SSE2) 2342 { 2343 return __builtin_ia32_minpd(a, b); 2344 } 2345 else 2346 { 2347 // Generates minpd starting with LDC 1.9 2348 a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2349 a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1]; 2350 return a; 2351 } 2352 } 2353 unittest 2354 { 2355 __m128d A = _mm_setr_pd(1.0, 2.0); 2356 __m128d B = _mm_setr_pd(4.0, 1.0); 2357 __m128d M = _mm_min_pd(A, B); 2358 assert(M.array[0] == 1.0); 2359 assert(M.array[1] == 1.0); 2360 } 2361 2362 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 2363 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 2364 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe 2365 { 2366 static if (GDC_with_SSE2) 2367 { 2368 return __builtin_ia32_minsd(a, b); 2369 } 2370 else 2371 { 2372 // Generates minsd starting with LDC 1.3 2373 __m128d r = a; 2374 r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2375 return r; 2376 } 2377 } 2378 unittest 2379 { 2380 __m128d A = _mm_setr_pd(1.0, 3.0); 2381 __m128d B = _mm_setr_pd(4.0, 2.0); 2382 __m128d M = _mm_min_sd(A, B); 2383 assert(M.array[0] == 1.0); 2384 assert(M.array[1] == 3.0); 2385 } 2386 2387 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element. 2388 __m128i _mm_move_epi64 (__m128i a) pure @trusted 2389 { 2390 static if (GDC_with_SSE2) 2391 { 2392 // slightly better with GDC -O0 2393 return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 2394 } 2395 else 2396 { 2397 long2 result = [ 0, 0 ]; 2398 long2 la = cast(long2) a; 2399 result.ptr[0] = la.array[0]; 2400 return cast(__m128i)(result); 2401 } 2402 } 2403 unittest 2404 { 2405 long2 A = [13, 47]; 2406 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); 2407 long[2] correct = [13, 0]; 2408 assert(B.array == correct); 2409 } 2410 2411 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 2412 /// the upper element from `a` to the upper element of dst. 
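/// Example: a minimal usage sketch (values chosen for illustration).
/// ---
/// __m128d r = _mm_move_sd(_mm_setr_pd(1.0, 2.0), _mm_setr_pd(3.0, 4.0)); // r.array == [3.0, 2.0]
/// ---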
2413 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted 2414 { 2415 static if (GDC_with_SSE2) 2416 { 2417 return __builtin_ia32_movsd(a, b); 2418 } 2419 else 2420 { 2421 b.ptr[1] = a.array[1]; 2422 return b; 2423 } 2424 } 2425 unittest 2426 { 2427 double2 A = [13.0, 47.0]; 2428 double2 B = [34.0, 58.0]; 2429 double2 C = _mm_move_sd(A, B); 2430 double[2] correct = [34.0, 47.0]; 2431 assert(C.array == correct); 2432 } 2433 2434 /// Create mask from the most significant bit of each 8-bit element in `v`. 2435 int _mm_movemask_epi8 (__m128i a) pure @trusted 2436 { 2437 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 2438 static if (GDC_with_SSE2) 2439 { 2440 return __builtin_ia32_pmovmskb128(cast(ubyte16)a); 2441 } 2442 else static if (LDC_with_SSE2) 2443 { 2444 return __builtin_ia32_pmovmskb128(cast(byte16)a); 2445 } 2446 else static if (LDC_with_ARM64) 2447 { 2448 // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon 2449 // The other two solutions lead to unfound intrinsics in LLVM and that took a long time. 2450 // SO there might be something a bit faster, but this one is reasonable and branchless. 2451 byte8 mask_shift; 2452 mask_shift.ptr[0] = 7; 2453 mask_shift.ptr[1] = 6; 2454 mask_shift.ptr[2] = 5; 2455 mask_shift.ptr[3] = 4; 2456 mask_shift.ptr[4] = 3; 2457 mask_shift.ptr[5] = 2; 2458 mask_shift.ptr[6] = 1; 2459 mask_shift.ptr[7] = 0; 2460 byte8 mask_and = byte8(-128); 2461 byte8 lo = vget_low_u8(cast(byte16)a); 2462 byte8 hi = vget_high_u8(cast(byte16)a); 2463 lo = vand_u8(lo, mask_and); 2464 lo = vshr_u8(lo, mask_shift); 2465 hi = vand_u8(hi, mask_and); 2466 hi = vshr_u8(hi, mask_shift); 2467 lo = vpadd_u8(lo,lo); 2468 lo = vpadd_u8(lo,lo); 2469 lo = vpadd_u8(lo,lo); 2470 hi = vpadd_u8(hi,hi); 2471 hi = vpadd_u8(hi,hi); 2472 hi = vpadd_u8(hi,hi); 2473 return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]); 2474 } 2475 else 2476 { 2477 byte16 ai = cast(byte16)a; 2478 int r = 0; 2479 foreach(bit; 0..16) 2480 { 2481 if (ai.array[bit] < 0) r += (1 << bit); 2482 } 2483 return r; 2484 } 2485 } 2486 unittest 2487 { 2488 assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0))); 2489 } 2490 2491 /// Create mask from the most significant bit of each 16-bit element in `v`. #BONUS 2492 int _mm_movemask_epi16 (__m128i a) pure @trusted 2493 { 2494 return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128())); 2495 } 2496 unittest 2497 { 2498 assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8))); 2499 } 2500 2501 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 2502 /// loating-point element in `v`. 2503 int _mm_movemask_pd(__m128d v) pure @safe 2504 { 2505 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 2506 static if (GDC_with_SSE2) 2507 { 2508 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2509 /// packed double-precision (64-bit) floating-point element in `v`. 2510 return __builtin_ia32_movmskpd(v); 2511 } 2512 else static if (LDC_with_SSE2) 2513 { 2514 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2515 /// packed double-precision (64-bit) floating-point element in `v`. 
2516 return __builtin_ia32_movmskpd(v); 2517 } 2518 else 2519 { 2520 long2 lv = cast(long2)v; 2521 int r = 0; 2522 if (lv.array[0] < 0) r += 1; 2523 if (lv.array[1] < 0) r += 2; 2524 return r; 2525 } 2526 } 2527 unittest 2528 { 2529 __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0); 2530 assert(_mm_movemask_pd(A) == 2); 2531 } 2532 2533 /// Copy the lower 64-bit integer in `v`. 2534 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe 2535 { 2536 long2 lv = cast(long2)v; 2537 return long1(lv.array[0]); 2538 } 2539 unittest 2540 { 2541 __m128i A = _mm_set_epi64x(-1, -2); 2542 __m64 R = _mm_movepi64_pi64(A); 2543 assert(R.array[0] == -2); 2544 } 2545 2546 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element. 2547 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted 2548 { 2549 long2 r; 2550 r.ptr[0] = a.array[0]; 2551 r.ptr[1] = 0; 2552 return cast(__m128i)r; 2553 } 2554 2555 // Note: generates pmuludq in LDC with -O1 2556 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted 2557 { 2558 __m128i zero = _mm_setzero_si128(); 2559 2560 static if (__VERSION__ >= 2088) 2561 { 2562 // Need LLVM9 to avoid this shufflevector 2563 long2 la, lb; 2564 la.ptr[0] = cast(uint)a.array[0]; 2565 la.ptr[1] = cast(uint)a.array[2]; 2566 lb.ptr[0] = cast(uint)b.array[0]; 2567 lb.ptr[1] = cast(uint)b.array[2]; 2568 } 2569 else 2570 { 2571 long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero); 2572 long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero); 2573 } 2574 2575 version(DigitalMars) 2576 { 2577 // DMD has no long2 mul 2578 // long2 mul not supported before LDC 1.5 2579 la.ptr[0] *= lb.array[0]; 2580 la.ptr[1] *= lb.array[1]; 2581 return cast(__m128i)(la); 2582 } 2583 else 2584 { 2585 static if (__VERSION__ >= 2076) 2586 { 2587 return cast(__m128i)(la * lb); 2588 } 2589 else 2590 { 2591 // long2 mul not supported before LDC 1.5 2592 la.ptr[0] *= lb.array[0]; 2593 la.ptr[1] *= lb.array[1]; 2594 return cast(__m128i)(la); 2595 } 2596 } 2597 } 2598 unittest 2599 { 2600 __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff); 2601 __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff); 2602 __m128i C = _mm_mul_epu32(A, B); 2603 long2 LC = cast(long2)C; 2604 assert(LC.array[0] == 18446744065119617025uL); 2605 assert(LC.array[1] == 12723420444339690338uL); 2606 } 2607 2608 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 2609 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe 2610 { 2611 pragma(inline, true); 2612 return a * b; 2613 } 2614 unittest 2615 { 2616 __m128d a = [-2.0, 1.5]; 2617 a = _mm_mul_pd(a, a); 2618 assert(a.array == [4.0, 2.25]); 2619 } 2620 2621 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 2622 /// element of result, and copy the upper element from `a` to the upper element of result. 
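/// Example: a minimal usage sketch (values chosen for illustration).
/// ---
/// __m128d r = _mm_mul_sd(_mm_setr_pd(2.0, 10.0), _mm_setr_pd(4.0, 20.0)); // r.array == [8.0, 10.0]
/// ---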
2623 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted 2624 { 2625 version(DigitalMars) 2626 { 2627 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 2628 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 2629 asm pure nothrow @nogc @trusted { nop;} 2630 a.array[0] = a.array[0] * b.array[0]; 2631 return a; 2632 } 2633 else static if (GDC_with_SSE2) 2634 { 2635 return __builtin_ia32_mulsd(a, b); 2636 } 2637 else 2638 { 2639 a.ptr[0] *= b.array[0]; 2640 return a; 2641 } 2642 } 2643 unittest 2644 { 2645 __m128d a = [-2.0, 1.5]; 2646 a = _mm_mul_sd(a, a); 2647 assert(a.array == [4.0, 1.5]); 2648 } 2649 2650 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 2651 /// and get an unsigned 64-bit result. 2652 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe 2653 { 2654 return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b))); 2655 } 2656 unittest 2657 { 2658 __m64 A = _mm_set_pi32(42, 0xDEADBEEF); 2659 __m64 B = _mm_set_pi32(42, 0xCAFEBABE); 2660 __m64 C = _mm_mul_su32(A, B); 2661 assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL); 2662 } 2663 2664 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2665 /// high 16 bits of the intermediate integers. 2666 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted 2667 { 2668 static if (GDC_with_SSE2) 2669 { 2670 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2671 } 2672 else static if (LDC_with_SSE2) 2673 { 2674 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2675 } 2676 else 2677 { 2678 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h 2679 // PERF: it seems the simde solution has one less instruction in ARM64. 2680 // PERF: Catastrophic in ARM32. 2681 short8 sa = cast(short8)a; 2682 short8 sb = cast(short8)b; 2683 short8 r = void; 2684 r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16; 2685 r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16; 2686 r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16; 2687 r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16; 2688 r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16; 2689 r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16; 2690 r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16; 2691 r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16; 2692 return cast(__m128i)r; 2693 } 2694 } 2695 unittest 2696 { 2697 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2698 __m128i B = _mm_set1_epi16(16384); 2699 short8 R = cast(short8)_mm_mulhi_epi16(A, B); 2700 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; 2701 assert(R.array == correct); 2702 } 2703 2704 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2705 /// high 16 bits of the intermediate integers. 2706 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted 2707 { 2708 static if (GDC_with_SSE2) 2709 { 2710 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2711 } 2712 else static if (LDC_with_SSE2) 2713 { 2714 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2715 } 2716 else 2717 { 2718 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h 2719 // it seems the simde solution has one less instruction in ARM64 2720 // PERF: Catastrophic in ARM32. 
2721 short8 sa = cast(short8)a; 2722 short8 sb = cast(short8)b; 2723 short8 r = void; 2724 r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 ); 2725 r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 ); 2726 r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 ); 2727 r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 ); 2728 r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 ); 2729 r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 ); 2730 r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 ); 2731 r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 ); 2732 return cast(__m128i)r; 2733 } 2734 } 2735 unittest 2736 { 2737 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2738 __m128i B = _mm_set1_epi16(16384); 2739 short8 R = cast(short8)_mm_mulhi_epu16(A, B); 2740 short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; 2741 assert(R.array == correct); 2742 } 2743 2744 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 2745 /// bits of the intermediate integers. 2746 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe 2747 { 2748 return cast(__m128i)(cast(short8)a * cast(short8)b); 2749 } 2750 unittest 2751 { 2752 __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7); 2753 __m128i B = _mm_set1_epi16(16384); 2754 short8 R = cast(short8)_mm_mullo_epi16(A, B); 2755 short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384]; 2756 assert(R.array == correct); 2757 } 2758 2759 /// Compute the bitwise NOT of 128 bits in `a`. #BONUS 2760 __m128i _mm_not_si128 (__m128i a) pure @safe 2761 { 2762 return ~a; 2763 } 2764 unittest 2765 { 2766 __m128i A = _mm_set1_epi32(-748); 2767 int4 notA = cast(int4) _mm_not_si128(A); 2768 int[4] correct = [747, 747, 747, 747]; 2769 assert(notA.array == correct); 2770 } 2771 2772 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 2773 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe 2774 { 2775 pragma(inline, true); 2776 return cast(__m128d)( cast(__m128i)a | cast(__m128i)b ); 2777 } 2778 2779 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`. 2780 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe 2781 { 2782 pragma(inline, true); 2783 return a | b; 2784 } 2785 2786 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation. 
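/// Example: a sketch of the signed saturation (values chosen for illustration).
/// ---
/// __m128i p = _mm_packs_epi32(_mm_setr_epi32(70000, -70000, 5, 0),
///                             _mm_setr_epi32(1, 2, 3, 4));
/// // as short8: [32767, -32768, 5, 0, 1, 2, 3, 4]
/// ---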
2787 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted 2788 { 2789 static if (GDC_with_SSE2) 2790 { 2791 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2792 } 2793 else static if (LDC_with_SSE2) 2794 { 2795 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2796 } 2797 else static if (LDC_with_ARM64) 2798 { 2799 short4 ra = vqmovn_s32(cast(int4)a); 2800 short4 rb = vqmovn_s32(cast(int4)b); 2801 return cast(__m128i)vcombine_s16(ra, rb); 2802 } 2803 else 2804 { 2805 // PERF: catastrophic on ARM32 2806 short8 r; 2807 r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]); 2808 r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]); 2809 r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]); 2810 r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]); 2811 r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]); 2812 r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]); 2813 r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]); 2814 r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]); 2815 return cast(__m128i)r; 2816 } 2817 } 2818 unittest 2819 { 2820 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 2821 short8 R = cast(short8) _mm_packs_epi32(A, A); 2822 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; 2823 assert(R.array == correct); 2824 } 2825 2826 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation. 2827 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted 2828 { 2829 static if (GDC_with_SSE2) 2830 { 2831 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2832 } 2833 else static if (LDC_with_SSE2) 2834 { 2835 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2836 } 2837 else static if (LDC_with_ARM64) 2838 { 2839 // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02 2840 byte8 ra = vqmovn_s16(cast(short8)a); 2841 byte8 rb = vqmovn_s16(cast(short8)b); 2842 return cast(__m128i)vcombine_s8(ra, rb); 2843 } 2844 else 2845 { 2846 // PERF: ARM32 is missing 2847 byte16 r; 2848 short8 sa = cast(short8)a; 2849 short8 sb = cast(short8)b; 2850 foreach(i; 0..8) 2851 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]); 2852 foreach(i; 0..8) 2853 r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]); 2854 return cast(__m128i)r; 2855 } 2856 } 2857 unittest 2858 { 2859 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); 2860 byte16 R = cast(byte16) _mm_packs_epi16(A, A); 2861 byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0, 2862 127, -128, 127, 0, 127, -128, 127, 0]; 2863 assert(R.array == correct); 2864 } 2865 2866 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation. 
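/// Example: a sketch of the unsigned saturation (values chosen for illustration).
/// ---
/// __m128i p = _mm_packus_epi16(_mm_setr_epi16(-5, 300, 60, 0, 1, 2, 3, 4),
///                              _mm_set1_epi16(255));
/// // low 8 bytes: 0, 255, 60, 0, 1, 2, 3, 4; high 8 bytes: all 255
/// ---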
2867 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2868 { 2869 static if (GDC_with_SSE2) 2870 { 2871 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2872 } 2873 else static if (LDC_with_SSE2) 2874 { 2875 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2876 } 2877 else static if (LDC_with_ARM64) 2878 { 2879 // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02 2880 byte8 ra = vqmovun_s16(cast(short8)a); 2881 byte8 rb = vqmovun_s16(cast(short8)b); 2882 return cast(__m128i)vcombine_s8(ra, rb); 2883 } 2884 else 2885 { 2886 short8 sa = cast(short8)a; 2887 short8 sb = cast(short8)b; 2888 ubyte[16] result = void; 2889 for (int i = 0; i < 8; ++i) 2890 { 2891 short s = sa[i]; 2892 if (s < 0) s = 0; 2893 if (s > 255) s = 255; 2894 result[i] = cast(ubyte)s; 2895 2896 s = sb[i]; 2897 if (s < 0) s = 0; 2898 if (s > 255) s = 255; 2899 result[i+8] = cast(ubyte)s; 2900 } 2901 return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr); 2902 } 2903 } 2904 unittest 2905 { 2906 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 2907 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 2908 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 2909 0, 255, 0, 255, 255, 2, 1, 0]; 2910 foreach(i; 0..16) 2911 assert(AA.array[i] == cast(byte)(correctResult[i])); 2912 } 2913 2914 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 2915 /// and power consumption of spin-wait loops. 2916 void _mm_pause() @trusted 2917 { 2918 version(GNU) 2919 { 2920 static if (GDC_with_SSE2) 2921 { 2922 __builtin_ia32_pause(); 2923 } 2924 else version(X86) 2925 { 2926 asm pure nothrow @nogc @trusted 2927 { 2928 "pause;\n" : : : ; 2929 } 2930 } 2931 else 2932 static assert(false); 2933 } 2934 else static if (LDC_with_SSE2) 2935 { 2936 __builtin_ia32_pause(); 2937 } 2938 else static if (DMD_with_asm) 2939 { 2940 asm nothrow @nogc pure @safe 2941 { 2942 rep; nop; // F3 90 = pause 2943 } 2944 } 2945 else version (LDC) 2946 { 2947 // PERF: Do nothing currently , could be the "yield" intruction on ARM. 2948 } 2949 else 2950 static assert(false); 2951 } 2952 unittest 2953 { 2954 _mm_pause(); 2955 } 2956 2957 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 2958 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 2959 /// low 16 bits of 64-bit elements in result. 
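/// Example: a worked sketch (values chosen for illustration).
/// ---
/// __m128i x = _mm_set1_epi8(9);
/// __m128i y = _mm_set1_epi8(2);
/// // each |9 - 2| = 7, and each group of 8 differences sums to 56
/// __m128i sums = _mm_sad_epu8(x, y); // as long2: [56, 56]
/// ---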
2960 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted 2961 { 2962 static if (GDC_with_SSE2) 2963 { 2964 return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b); 2965 } 2966 else static if (LDC_with_SSE2) 2967 { 2968 return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b); 2969 } 2970 else static if (LDC_with_ARM64) 2971 { 2972 ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b)); 2973 2974 // PERF: Looks suboptimal vs addp 2975 ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]); 2976 ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]); 2977 ushort8 r = 0; 2978 r[0] = r0; 2979 r[4] = r4; 2980 return cast(__m128i) r; 2981 } 2982 else 2983 { 2984 // PERF: ARM32 is lacking 2985 byte16 ab = cast(byte16)a; 2986 byte16 bb = cast(byte16)b; 2987 ubyte[16] t; 2988 foreach(i; 0..16) 2989 { 2990 int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]); 2991 if (diff < 0) diff = -diff; 2992 t[i] = cast(ubyte)(diff); 2993 } 2994 int4 r = _mm_setzero_si128(); 2995 r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 2996 r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; 2997 return r; 2998 } 2999 } 3000 unittest 3001 { 3002 __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 3003 __m128i B = _mm_set1_epi8(1); 3004 __m128i R = _mm_sad_epu8(A, B); 3005 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 3006 0, 3007 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 3008 0]; 3009 assert(R.array == correct); 3010 } 3011 3012 /// Set packed 16-bit integers with the supplied values. 3013 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 3014 { 3015 short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7]; 3016 return cast(__m128i) loadUnaligned!(short8)(result.ptr); 3017 } 3018 unittest 3019 { 3020 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 3021 short8 B = cast(short8) A; 3022 foreach(i; 0..8) 3023 assert(B.array[i] == i); 3024 } 3025 3026 /// Set packed 32-bit integers with the supplied values. 3027 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3028 { 3029 pragma(inline, true); 3030 int[4] result = [e0, e1, e2, e3]; 3031 return loadUnaligned!(int4)(result.ptr); 3032 } 3033 unittest 3034 { 3035 __m128i A = _mm_set_epi32(3, 2, 1, 0); 3036 foreach(i; 0..4) 3037 assert(A.array[i] == i); 3038 } 3039 3040 /// Set packed 64-bit integers with the supplied values. 3041 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted 3042 { 3043 pragma(inline, true); 3044 long[2] result = [e0.array[0], e1.array[0]]; 3045 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 3046 } 3047 unittest 3048 { 3049 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); 3050 long2 B = cast(long2) A; 3051 assert(B.array[0] == 5678); 3052 assert(B.array[1] == 1234); 3053 } 3054 3055 /// Set packed 64-bit integers with the supplied values. 3056 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted 3057 { 3058 pragma(inline, true); 3059 long[2] result = [e0, e1]; 3060 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 3061 } 3062 unittest 3063 { 3064 __m128i A = _mm_set_epi64x(1234, 5678); 3065 long2 B = cast(long2) A; 3066 assert(B.array[0] == 5678); 3067 assert(B.array[1] == 1234); 3068 } 3069 3070 /// Set packed 8-bit integers with the supplied values. 
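/// Example: a minimal sketch; `e15` lands in the most significant byte and `e0` in the least (values chosen for illustration).
/// ---
/// byte16 v = cast(byte16) _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// // v.array[0] == 0, v.array[15] == 15
/// ---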
3071 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12, 3072 byte e11, byte e10, byte e9, byte e8, 3073 byte e7, byte e6, byte e5, byte e4, 3074 byte e3, byte e2, byte e1, byte e0) pure @trusted 3075 { 3076 byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7, 3077 e8, e9, e10, e11, e12, e13, e14, e15]; 3078 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 3079 } 3080 3081 /// Set packed double-precision (64-bit) floating-point elements with the supplied values. 3082 __m128d _mm_set_pd (double e1, double e0) pure @trusted 3083 { 3084 pragma(inline, true); 3085 double[2] result = [e0, e1]; 3086 return loadUnaligned!(double2)(result.ptr); 3087 } 3088 unittest 3089 { 3090 __m128d A = _mm_set_pd(61.0, 55.0); 3091 double[2] correct = [55.0, 61.0]; 3092 assert(A.array == correct); 3093 } 3094 3095 /// Broadcast double-precision (64-bit) floating-point value `a` to all element. 3096 __m128d _mm_set_pd1 (double a) pure @trusted 3097 { 3098 pragma(inline, true); 3099 double[2] result = [a, a]; 3100 return loadUnaligned!(double2)(result.ptr); 3101 } 3102 unittest 3103 { 3104 __m128d A = _mm_set_pd1(61.0); 3105 double[2] correct = [61.0, 61.0]; 3106 assert(A.array == correct); 3107 } 3108 3109 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 3110 /// and zero the upper element. 3111 __m128d _mm_set_sd (double a) pure @trusted 3112 { 3113 double[2] result = [a, 0]; 3114 return loadUnaligned!(double2)(result.ptr); 3115 } 3116 3117 /// Broadcast 16-bit integer a to all elements of dst. 3118 __m128i _mm_set1_epi16 (short a) pure @trusted 3119 { 3120 version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 3121 { 3122 short8 v = a; 3123 return cast(__m128i) v; 3124 } 3125 else 3126 { 3127 pragma(inline, true); 3128 return cast(__m128i)(short8(a)); 3129 } 3130 } 3131 unittest 3132 { 3133 short8 a = cast(short8) _mm_set1_epi16(31); 3134 for (int i = 0; i < 8; ++i) 3135 assert(a.array[i] == 31); 3136 } 3137 3138 /// Broadcast 32-bit integer `a` to all elements. 3139 __m128i _mm_set1_epi32 (int a) pure @trusted 3140 { 3141 pragma(inline, true); 3142 return cast(__m128i)(int4(a)); 3143 } 3144 unittest 3145 { 3146 int4 a = cast(int4) _mm_set1_epi32(31); 3147 for (int i = 0; i < 4; ++i) 3148 assert(a.array[i] == 31); 3149 } 3150 3151 /// Broadcast 64-bit integer `a` to all elements. 3152 __m128i _mm_set1_epi64 (__m64 a) pure @safe 3153 { 3154 return _mm_set_epi64(a, a); 3155 } 3156 unittest 3157 { 3158 long b = 0x1DEADCAFE; 3159 __m64 a; 3160 a.ptr[0] = b; 3161 long2 c = cast(long2) _mm_set1_epi64(a); 3162 assert(c.array[0] == b); 3163 assert(c.array[1] == b); 3164 } 3165 3166 /// Broadcast 64-bit integer `a` to all elements 3167 __m128i _mm_set1_epi64x (long a) pure @trusted 3168 { 3169 long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3170 return cast(__m128i)(b); 3171 } 3172 unittest 3173 { 3174 long b = 0x1DEADCAFE; 3175 long2 c = cast(long2) _mm_set1_epi64x(b); 3176 for (int i = 0; i < 2; ++i) 3177 assert(c.array[i] == b); 3178 } 3179 3180 /// Broadcast 8-bit integer `a` to all elements. 
3181 __m128i _mm_set1_epi8 (byte a) pure @trusted 3182 { 3183 pragma(inline, true); 3184 byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3185 return cast(__m128i)(b); 3186 } 3187 unittest 3188 { 3189 byte16 b = cast(byte16) _mm_set1_epi8(31); 3190 for (int i = 0; i < 16; ++i) 3191 assert(b.array[i] == 31); 3192 } 3193 3194 alias _mm_set1_pd = _mm_set_pd1; 3195 3196 /// Set packed 16-bit integers with the supplied values in reverse order. 3197 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 3198 short e3, short e2, short e1, short e0) pure @trusted 3199 { 3200 short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0]; 3201 return cast(__m128i)( loadUnaligned!(short8)(result.ptr) ); 3202 } 3203 unittest 3204 { 3205 short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0); 3206 short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0]; 3207 assert(A.array == correct); 3208 } 3209 3210 /// Set packed 32-bit integers with the supplied values in reverse order. 3211 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3212 { 3213 pragma(inline, true); 3214 int[4] result = [e3, e2, e1, e0]; 3215 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 3216 } 3217 unittest 3218 { 3219 int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647); 3220 int[4] correct = [-1, 0, -2147483648, 2147483647]; 3221 assert(A.array == correct); 3222 } 3223 3224 /// Set packed 64-bit integers with the supplied values in reverse order. 3225 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted 3226 { 3227 long[2] result = [e1, e0]; 3228 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 3229 } 3230 unittest 3231 { 3232 long2 A = cast(long2) _mm_setr_epi64(-1, 0); 3233 long[2] correct = [-1, 0]; 3234 assert(A.array == correct); 3235 } 3236 3237 /// Set packed 8-bit integers with the supplied values in reverse order. 3238 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12, 3239 byte e11, byte e10, byte e9, byte e8, 3240 byte e7, byte e6, byte e5, byte e4, 3241 byte e3, byte e2, byte e1, byte e0) pure @trusted 3242 { 3243 byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, 3244 e7, e6, e5, e4, e3, e2, e1, e0]; 3245 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 3246 } 3247 3248 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order. 3249 __m128d _mm_setr_pd (double e1, double e0) pure @trusted 3250 { 3251 pragma(inline, true); 3252 double2 result; 3253 result.ptr[0] = e1; 3254 result.ptr[1] = e0; 3255 return result; 3256 } 3257 unittest 3258 { 3259 __m128d A = _mm_setr_pd(61.0, 55.0); 3260 double[2] correct = [61.0, 55.0]; 3261 assert(A.array == correct); 3262 } 3263 3264 /// Return vector of type `__m128d` with all elements set to zero. 3265 __m128d _mm_setzero_pd () pure @trusted 3266 { 3267 pragma(inline, true); 3268 // Note: using loadUnaligned has better -O0 codegen compared to .ptr 3269 double[2] result = [0.0, 0.0]; 3270 return loadUnaligned!(double2)(result.ptr); 3271 } 3272 3273 /// Return vector of type `__m128i` with all elements set to zero. 3274 __m128i _mm_setzero_si128() pure @trusted 3275 { 3276 pragma(inline, true); 3277 // Note: using loadUnaligned has better -O0 codegen compared to .ptr 3278 int[4] result = [0, 0, 0, 0]; 3279 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 3280 } 3281 3282 /// Shuffle 32-bit integers in a using the control in `imm8`. 3283 /// See_also: `_MM_SHUFFLE`. 
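/// Example: a minimal sketch using `_MM_SHUFFLE` (values chosen for illustration).
/// ---
/// enum int SHUF = _MM_SHUFFLE(0, 0, 3, 2);
/// int4 r = cast(int4) _mm_shuffle_epi32!SHUF(_mm_setr_epi32(10, 11, 12, 13));
/// // r.array == [12, 13, 10, 10]
/// ---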
3284 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe 3285 { 3286 static if (GDC_with_SSE2) 3287 { 3288 return __builtin_ia32_pshufd(a, imm8); 3289 } 3290 else 3291 { 3292 return shufflevector!(int4, (imm8 >> 0) & 3, 3293 (imm8 >> 2) & 3, 3294 (imm8 >> 4) & 3, 3295 (imm8 >> 6) & 3)(a, a); 3296 } 3297 } 3298 unittest 3299 { 3300 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 3301 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3302 int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A); 3303 int[4] expectedB = [ 3, 2, 1, 0 ]; 3304 assert(B.array == expectedB); 3305 } 3306 3307 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`. 3308 /// See_also: `_MM_SHUFFLE2`. 3309 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe 3310 { 3311 static if (GDC_with_SSE2) 3312 { 3313 return __builtin_ia32_shufpd(a, b, imm8); 3314 } 3315 else 3316 { 3317 return shufflevector!(double2, 0 + ( imm8 & 1 ), 3318 2 + ( (imm8 >> 1) & 1 ))(a, b); 3319 } 3320 } 3321 unittest 3322 { 3323 __m128d A = _mm_setr_pd(0.5, 2.0); 3324 __m128d B = _mm_setr_pd(4.0, 5.0); 3325 enum int SHUFFLE = _MM_SHUFFLE2(1, 1); 3326 __m128d R = _mm_shuffle_pd!SHUFFLE(A, B); 3327 double[2] correct = [ 2.0, 5.0 ]; 3328 assert(R.array == correct); 3329 } 3330 3331 /// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 3332 /// 64 bits of result, with the low 64 bits being copied from from `a` to result. 3333 /// See also: `_MM_SHUFFLE`. 3334 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe 3335 { 3336 static if (GDC_with_SSE2) 3337 { 3338 return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8); 3339 } 3340 else 3341 { 3342 return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3, 3343 4 + ( (imm8 >> 0) & 3 ), 3344 4 + ( (imm8 >> 2) & 3 ), 3345 4 + ( (imm8 >> 4) & 3 ), 3346 4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a); 3347 } 3348 } 3349 unittest 3350 { 3351 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3352 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3353 short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A); 3354 short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ]; 3355 assert(C.array == expectedC); 3356 } 3357 3358 /// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 3359 /// bits of result, with the high 64 bits being copied from from `a` to result. 3360 /// See_also: `_MM_SHUFFLE`. 3361 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe 3362 { 3363 static if (GDC_with_SSE2) 3364 { 3365 return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8); 3366 } 3367 else 3368 { 3369 return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ), 3370 ( (imm8 >> 2) & 3 ), 3371 ( (imm8 >> 4) & 3 ), 3372 ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a); 3373 } 3374 } 3375 unittest 3376 { 3377 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3378 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3379 short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A); 3380 short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ]; 3381 assert(B.array == expectedB); 3382 } 3383 3384 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros. 
deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            pslld XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << bits;
        return r;
    }
}

/// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllq XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // ARM: good since LDC 1.12 -O2
        // but the -O0 version is catastrophic
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..2)
            r.array[i] = cast(ulong)(sa.array[i]) << bits;
        return cast(__m128i)r;
    }
}

/// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
        return cast(int4)r;
    }
}


/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        // D says "It's illegal to shift by the same or more bits
        // than the size of the quantity being shifted"
        // and it's UB instead.
3505 int4 r = _mm_setzero_si128(); 3506 3507 ubyte count = cast(ubyte) imm8; 3508 if (count > 31) 3509 return r; 3510 3511 foreach(i; 0..4) 3512 r.array[i] = cast(uint)(a.array[i]) << count; 3513 return r; 3514 } 3515 } 3516 unittest 3517 { 3518 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3519 __m128i B = _mm_slli_epi32(A, 1); 3520 __m128i B2 = _mm_slli_epi32(A, 1 + 256); 3521 int[4] expectedB = [ 0, 4, 6, -8]; 3522 assert(B.array == expectedB); 3523 assert(B2.array == expectedB); 3524 3525 __m128i C = _mm_slli_epi32(A, 0); 3526 int[4] expectedC = [ 0, 2, 3, -4]; 3527 assert(C.array == expectedC); 3528 3529 __m128i D = _mm_slli_epi32(A, 65); 3530 int[4] expectedD = [ 0, 0, 0, 0]; 3531 assert(D.array == expectedD); 3532 } 3533 3534 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 3535 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted 3536 { 3537 static if (GDC_with_SSE2) 3538 { 3539 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3540 } 3541 else static if (LDC_with_SSE2) 3542 { 3543 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3544 } 3545 else 3546 { 3547 long2 sa = cast(long2)a; 3548 3549 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3550 // D says "It's illegal to shift by the same or more bits 3551 // than the size of the quantity being shifted" 3552 // and it's UB instead. 3553 long2 r = cast(long2) _mm_setzero_si128(); 3554 ubyte count = cast(ubyte) imm8; 3555 if (count > 63) 3556 return cast(__m128i)r; 3557 3558 r.ptr[0] = cast(ulong)(sa.array[0]) << count; 3559 r.ptr[1] = cast(ulong)(sa.array[1]) << count; 3560 return cast(__m128i)r; 3561 } 3562 } 3563 unittest 3564 { 3565 __m128i A = _mm_setr_epi64(8, -4); 3566 long2 B = cast(long2) _mm_slli_epi64(A, 1); 3567 long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024); 3568 long[2] expectedB = [ 16, -8]; 3569 assert(B.array == expectedB); 3570 assert(B2.array == expectedB); 3571 3572 long2 C = cast(long2) _mm_slli_epi64(A, 0); 3573 long[2] expectedC = [ 8, -4]; 3574 assert(C.array == expectedC); 3575 3576 long2 D = cast(long2) _mm_slli_epi64(A, 64); 3577 long[2] expectedD = [ 0, -0]; 3578 assert(D.array == expectedD); 3579 } 3580 3581 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 
3582 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted 3583 { 3584 static if (GDC_with_SSE2) 3585 { 3586 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3587 } 3588 else static if (LDC_with_SSE2) 3589 { 3590 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3591 } 3592 else static if (LDC_with_ARM64) 3593 { 3594 short8 sa = cast(short8)a; 3595 short8 r = cast(short8)_mm_setzero_si128(); 3596 ubyte count = cast(ubyte) imm8; 3597 if (count > 15) 3598 return cast(__m128i)r; 3599 r = sa << short8(count); 3600 return cast(__m128i)r; 3601 } 3602 else 3603 { 3604 short8 sa = cast(short8)a; 3605 short8 r = cast(short8)_mm_setzero_si128(); 3606 ubyte count = cast(ubyte) imm8; 3607 if (count > 15) 3608 return cast(__m128i)r; 3609 foreach(i; 0..8) 3610 r.ptr[i] = cast(short)(sa.array[i] << count); 3611 return cast(__m128i)r; 3612 } 3613 } 3614 unittest 3615 { 3616 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3617 short8 B = cast(short8)( _mm_slli_epi16(A, 1) ); 3618 short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) ); 3619 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 3620 assert(B.array == expectedB); 3621 assert(B2.array == expectedB); 3622 3623 short8 C = cast(short8)( _mm_slli_epi16(A, 16) ); 3624 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 3625 assert(C.array == expectedC); 3626 } 3627 3628 3629 /// Shift `a` left by `bytes` bytes while shifting in zeros. 3630 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted 3631 { 3632 static if (bytes & 0xF0) 3633 { 3634 return _mm_setzero_si128(); 3635 } 3636 else 3637 { 3638 static if (GDC_with_SSE2) 3639 { 3640 return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 3641 } 3642 else version(DigitalMars) 3643 { 3644 version(D_InlineAsm_X86) 3645 { 3646 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64 3647 { 3648 movdqu XMM0, op; 3649 pslldq XMM0, bytes; 3650 movdqu op, XMM0; 3651 } 3652 return op; 3653 } 3654 else 3655 { 3656 byte16 A = cast(byte16)op; 3657 byte16 R; 3658 for (int n = 15; n >= bytes; --n) 3659 R.ptr[n] = A.array[n-bytes]; 3660 for (int n = bytes-1; n >= 0; --n) 3661 R.ptr[n] = 0; 3662 return cast(__m128i)R; 3663 } 3664 } 3665 else 3666 { 3667 return cast(__m128i) shufflevector!(byte16, 3668 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes, 3669 22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes, 3670 28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes) 3671 (cast(byte16)_mm_setzero_si128(), cast(byte16)op); 3672 } 3673 } 3674 } 3675 unittest 3676 { 3677 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3678 short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left 3679 short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ]; 3680 assert(R.array == correct); 3681 3682 __m128i B = _mm_srli_si128!16(_mm_set1_epi32(-1)); 3683 int[4] expectedB = [0, 0, 0, 0]; 3684 assert(B.array == expectedB); 3685 } 3686 3687 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`. 
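/// Example: a minimal usage sketch (values chosen for illustration).
/// ---
/// __m128d r = _mm_sqrt_pd(_mm_setr_pd(4.0, 9.0)); // r.array == [2.0, 3.0]
/// ---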
3688 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted 3689 { 3690 version(LDC) 3691 { 3692 // Disappeared with LDC 1.11 3693 static if (__VERSION__ < 2081) 3694 return __builtin_ia32_sqrtpd(vec); 3695 else 3696 { 3697 vec.array[0] = llvm_sqrt(vec.array[0]); 3698 vec.array[1] = llvm_sqrt(vec.array[1]); 3699 return vec; 3700 } 3701 } 3702 else static if (GDC_with_SSE2) 3703 { 3704 return __builtin_ia32_sqrtpd(vec); 3705 } 3706 else 3707 { 3708 vec.ptr[0] = sqrt(vec.array[0]); 3709 vec.ptr[1] = sqrt(vec.array[1]); 3710 return vec; 3711 } 3712 } 3713 3714 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 3715 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 3716 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted 3717 { 3718 // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only. 3719 // "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 3720 // The quadword at bits 127:64 of the destination operand remains unchanged." 3721 version(LDC) 3722 { 3723 // Disappeared with LDC 1.11 3724 static if (__VERSION__ < 2081) 3725 { 3726 __m128d c = __builtin_ia32_sqrtsd(b); 3727 a[0] = c[0]; 3728 return a; 3729 } 3730 else 3731 { 3732 a.array[0] = llvm_sqrt(b.array[0]); 3733 return a; 3734 } 3735 } 3736 else static if (GDC_with_SSE2) 3737 { 3738 __m128d c = __builtin_ia32_sqrtsd(b); 3739 a.ptr[0] = c.array[0]; 3740 return a; 3741 } 3742 else 3743 { 3744 a.ptr[0] = sqrt(b.array[0]); 3745 return a; 3746 } 3747 } 3748 unittest 3749 { 3750 __m128d A = _mm_setr_pd(1.0, 3.0); 3751 __m128d B = _mm_setr_pd(4.0, 5.0); 3752 __m128d R = _mm_sqrt_sd(A, B); 3753 double[2] correct = [2.0, 3.0 ]; 3754 assert(R.array == correct); 3755 } 3756 3757 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits. 3758 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted 3759 { 3760 static if (GDC_with_SSE2) 3761 { 3762 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3763 } 3764 else static if (LDC_with_SSE2) 3765 { 3766 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3767 } 3768 else 3769 { 3770 short8 sa = cast(short8)a; 3771 long2 lc = cast(long2)count; 3772 int bits = cast(int)(lc.array[0]); 3773 short8 r = void; 3774 foreach(i; 0..8) 3775 r.ptr[i] = cast(short)(sa.array[i] >> bits); 3776 return cast(int4)r; 3777 } 3778 } 3779 3780 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits. 3781 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted 3782 { 3783 static if (LDC_with_SSE2) 3784 { 3785 return __builtin_ia32_psrad128(a, count); 3786 } 3787 else static if (GDC_with_SSE2) 3788 { 3789 return __builtin_ia32_psrad128(a, count); 3790 } 3791 else 3792 { 3793 int4 r = void; 3794 long2 lc = cast(long2)count; 3795 int bits = cast(int)(lc.array[0]); 3796 r.ptr[0] = (a.array[0] >> bits); 3797 r.ptr[1] = (a.array[1] >> bits); 3798 r.ptr[2] = (a.array[2] >> bits); 3799 r.ptr[3] = (a.array[3] >> bits); 3800 return r; 3801 } 3802 } 3803 3804 3805 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 
3806 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 3807 { 3808 static if (GDC_with_SSE2) 3809 { 3810 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3811 } 3812 else static if (LDC_with_SSE2) 3813 { 3814 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3815 } 3816 else static if (LDC_with_ARM64) 3817 { 3818 short8 sa = cast(short8)a; 3819 ubyte count = cast(ubyte)imm8; 3820 if (count > 15) 3821 count = 15; 3822 short8 r = sa >> short8(count); 3823 return cast(__m128i)r; 3824 } 3825 else 3826 { 3827 short8 sa = cast(short8)a; 3828 short8 r = void; 3829 3830 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3831 // D says "It's illegal to shift by the same or more bits 3832 // than the size of the quantity being shifted" 3833 // and it's UB instead. 3834 ubyte count = cast(ubyte)imm8; 3835 if (count > 15) 3836 count = 15; 3837 foreach(i; 0..8) 3838 r.ptr[i] = cast(short)(sa.array[i] >> count); 3839 return cast(int4)r; 3840 } 3841 } 3842 unittest 3843 { 3844 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3845 short8 B = cast(short8)( _mm_srai_epi16(A, 1) ); 3846 short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) ); 3847 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 3848 assert(B.array == expectedB); 3849 assert(B2.array == expectedB); 3850 3851 short8 C = cast(short8)( _mm_srai_epi16(A, 18) ); 3852 short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ]; 3853 assert(C.array == expectedC); 3854 } 3855 3856 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 3857 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted 3858 { 3859 static if (LDC_with_SSE2) 3860 { 3861 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 3862 } 3863 else static if (GDC_with_SSE2) 3864 { 3865 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 3866 } 3867 else 3868 { 3869 int4 r = void; 3870 3871 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3872 // D says "It's illegal to shift by the same or more bits 3873 // than the size of the quantity being shifted" 3874 // and it's UB instead. 
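            // Clamping the count to 31 matches the hardware behaviour: an arithmetic right
            // shift by 32 or more leaves each lane filled with its sign bit (0 or -1).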
3875 ubyte count = cast(ubyte) imm8; 3876 if (count > 31) 3877 count = 31; 3878 3879 r.ptr[0] = (a.array[0] >> count); 3880 r.ptr[1] = (a.array[1] >> count); 3881 r.ptr[2] = (a.array[2] >> count); 3882 r.ptr[3] = (a.array[3] >> count); 3883 return r; 3884 } 3885 } 3886 unittest 3887 { 3888 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3889 __m128i B = _mm_srai_epi32(A, 1); 3890 __m128i B2 = _mm_srai_epi32(A, 1 + 256); 3891 int[4] expectedB = [ 0, 1, 1, -2]; 3892 assert(B.array == expectedB); 3893 assert(B2.array == expectedB); 3894 3895 __m128i C = _mm_srai_epi32(A, 32); 3896 int[4] expectedC = [ 0, 0, 0, -1]; 3897 assert(C.array == expectedC); 3898 3899 __m128i D = _mm_srai_epi32(A, 0); 3900 int[4] expectedD = [ 0, 2, 3, -4]; 3901 assert(D.array == expectedD); 3902 } 3903 3904 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted 3905 { 3906 static if (LDC_with_SSE2) 3907 { 3908 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 3909 } 3910 else static if (GDC_with_SSE2) 3911 { 3912 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 3913 } 3914 else 3915 { 3916 short8 sa = cast(short8)a; 3917 long2 lc = cast(long2)count; 3918 int bits = cast(int)(lc.array[0]); 3919 short8 r = void; 3920 foreach(i; 0..8) 3921 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits); 3922 return cast(int4)r; 3923 } 3924 } 3925 3926 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted 3927 { 3928 static if (LDC_with_SSE2) 3929 { 3930 return __builtin_ia32_psrld128(a, count); 3931 } 3932 else static if (GDC_with_SSE2) 3933 { 3934 return __builtin_ia32_psrld128(a, count); 3935 } 3936 else 3937 { 3938 int4 r = void; 3939 long2 lc = cast(long2)count; 3940 int bits = cast(int)(lc.array[0]); 3941 r.ptr[0] = cast(uint)(a.array[0]) >> bits; 3942 r.ptr[1] = cast(uint)(a.array[1]) >> bits; 3943 r.ptr[2] = cast(uint)(a.array[2]) >> bits; 3944 r.ptr[3] = cast(uint)(a.array[3]) >> bits; 3945 return r; 3946 } 3947 } 3948 3949 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted 3950 { 3951 static if (LDC_with_SSE2) 3952 { 3953 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 3954 } 3955 else static if (GDC_with_SSE2) 3956 { 3957 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 3958 } 3959 else 3960 { 3961 long2 r = void; 3962 long2 sa = cast(long2)a; 3963 long2 lc = cast(long2)count; 3964 int bits = cast(int)(lc.array[0]); 3965 r.ptr[0] = cast(ulong)(sa.array[0]) >> bits; 3966 r.ptr[1] = cast(ulong)(sa.array[1]) >> bits; 3967 return cast(__m128i)r; 3968 } 3969 } 3970 3971 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 3972 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted 3973 { 3974 static if (GDC_with_SSE2) 3975 { 3976 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 3977 } 3978 else static if (LDC_with_SSE2) 3979 { 3980 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 3981 } 3982 else static if (LDC_with_ARM64) 3983 { 3984 short8 sa = cast(short8)a; 3985 short8 r = cast(short8) _mm_setzero_si128(); 3986 3987 ubyte count = cast(ubyte)imm8; 3988 if (count >= 16) 3989 return cast(__m128i)r; 3990 3991 r = sa >>> short8(count); // This facility offered with LDC, but not DMD. 
3992 return cast(__m128i)r; 3993 } 3994 else 3995 { 3996 short8 sa = cast(short8)a; 3997 ubyte count = cast(ubyte)imm8; 3998 3999 short8 r = cast(short8) _mm_setzero_si128(); 4000 if (count >= 16) 4001 return cast(__m128i)r; 4002 4003 foreach(i; 0..8) 4004 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count); 4005 return cast(__m128i)r; 4006 } 4007 } 4008 unittest 4009 { 4010 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 4011 short8 B = cast(short8)( _mm_srli_epi16(A, 1) ); 4012 short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) ); 4013 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 4014 assert(B.array == expectedB); 4015 assert(B2.array == expectedB); 4016 4017 short8 C = cast(short8)( _mm_srli_epi16(A, 16) ); 4018 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0]; 4019 assert(C.array == expectedC); 4020 4021 short8 D = cast(short8)( _mm_srli_epi16(A, 0) ); 4022 short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ]; 4023 assert(D.array == expectedD); 4024 } 4025 4026 4027 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 4028 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted 4029 { 4030 static if (GDC_with_SSE2) 4031 { 4032 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4033 } 4034 else static if (LDC_with_SSE2) 4035 { 4036 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4037 } 4038 else 4039 { 4040 ubyte count = cast(ubyte) imm8; 4041 4042 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4043 // D says "It's illegal to shift by the same or more bits 4044 // than the size of the quantity being shifted" 4045 // and it's UB instead. 4046 int4 r = _mm_setzero_si128(); 4047 if (count >= 32) 4048 return r; 4049 r.ptr[0] = a.array[0] >>> count; 4050 r.ptr[1] = a.array[1] >>> count; 4051 r.ptr[2] = a.array[2] >>> count; 4052 r.ptr[3] = a.array[3] >>> count; 4053 return r; 4054 } 4055 } 4056 unittest 4057 { 4058 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 4059 __m128i B = _mm_srli_epi32(A, 1); 4060 __m128i B2 = _mm_srli_epi32(A, 1 + 256); 4061 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; 4062 assert(B.array == expectedB); 4063 assert(B2.array == expectedB); 4064 4065 __m128i C = _mm_srli_epi32(A, 255); 4066 int[4] expectedC = [ 0, 0, 0, 0 ]; 4067 assert(C.array == expectedC); 4068 } 4069 4070 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 4071 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted 4072 { 4073 static if (GDC_with_SSE2) 4074 { 4075 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4076 } 4077 else static if (LDC_with_SSE2) 4078 { 4079 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4080 } 4081 else 4082 { 4083 long2 r = cast(long2) _mm_setzero_si128(); 4084 long2 sa = cast(long2)a; 4085 4086 ubyte count = cast(ubyte) imm8; 4087 if (count >= 64) 4088 return cast(__m128i)r; 4089 4090 r.ptr[0] = sa.array[0] >>> count; 4091 r.ptr[1] = sa.array[1] >>> count; 4092 return cast(__m128i)r; 4093 } 4094 } 4095 unittest 4096 { 4097 __m128i A = _mm_setr_epi64(8, -4); 4098 long2 B = cast(long2) _mm_srli_epi64(A, 1); 4099 long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512); 4100 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; 4101 assert(B.array == expectedB); 4102 assert(B2.array == expectedB); 4103 4104 long2 C = cast(long2) _mm_srli_epi64(A, 64); 4105 long[2] expectedC = [ 0, 0 ]; 4106 assert(C.array == expectedC); 4107 } 4108 4109 /// Shift `v` right by `bytes` bytes while shifting in zeros. 
4110 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe 4111 { 4112 static if (bytes & 0xF0) 4113 { 4114 return _mm_setzero_si128(); 4115 } 4116 else static if (GDC_with_SSE2) 4117 { 4118 return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8)); 4119 } 4120 else static if (DMD_with_32bit_asm) 4121 { 4122 asm pure nothrow @nogc @trusted 4123 { 4124 movdqu XMM0, v; 4125 psrldq XMM0, bytes; 4126 movdqu v, XMM0; 4127 } 4128 return v; 4129 } 4130 else 4131 { 4132 return cast(__m128i) shufflevector!(byte16, 4133 bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7, 4134 bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15) 4135 (cast(byte16) v, cast(byte16)_mm_setzero_si128()); 4136 } 4137 } 4138 unittest 4139 { 4140 __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1)); 4141 int[4] correct = [2, 3, 4, 0]; 4142 assert(R.array == correct); 4143 4144 __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1)); 4145 int[4] expectedA = [0, 0, 0, 0]; 4146 assert(A.array == expectedA); 4147 } 4148 4149 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4150 /// #BONUS 4151 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe 4152 { 4153 return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v); 4154 } 4155 unittest 4156 { 4157 __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)); 4158 float[4] correct = [3.0f, 4.0f, 0, 0]; 4159 assert(R.array == correct); 4160 } 4161 4162 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4163 /// #BONUS 4164 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe 4165 { 4166 return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v); 4167 } 4168 4169 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 4170 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 4171 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted 4172 { 4173 pragma(inline, true); 4174 __m128d* aligned = cast(__m128d*)mem_addr; 4175 *aligned = a; 4176 } 4177 4178 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 4179 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 4180 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted 4181 { 4182 __m128d* aligned = cast(__m128d*)mem_addr; 4183 __m128d r; 4184 r.ptr[0] = a.array[0]; 4185 r.ptr[1] = a.array[0]; 4186 *aligned = r; 4187 } 4188 4189 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 4190 /// be aligned on any particular boundary. 4191 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe 4192 { 4193 pragma(inline, true); 4194 *mem_addr = a.array[0]; 4195 } 4196 4197 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 4198 /// general-protection exception may be generated. 4199 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe 4200 { 4201 pragma(inline, true); 4202 *mem_addr = a; 4203 } 4204 4205 alias _mm_store1_pd = _mm_store_pd1; /// 4206 4207 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory. 
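/// `mem_addr` does not need to be aligned on any particular boundary.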
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[1];
}

// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
/// Store 64-bit integer from the first element of `a` into memory.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    long* dest = cast(long*)mem_addr;
    long2 la = cast(long2)a;
    *dest = la.array[0];
}
unittest
{
    long[3] A = [1, 2, 3];
    _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000));
    long[3] correct = [1, 0x1_0000_0000, 3];
    assert(A == correct);
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    *mem_addr = a.array[0];
}

/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be
/// aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = shufflevector!(double2, 1, 0)(a, a);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    pragma(inline, true);
    storeUnaligned!double2(a, mem_addr);
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular
/// boundary.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    pragma(inline, true);
    storeUnaligned!__m128i(a, cast(int*)mem_addr);
}

/// Store 32-bit integer from the first element of `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted
{
    pragma(inline, true);
    int* dest = cast(int*)mem_addr;
    *dest = a.array[0];
}
unittest
{
    int[2] arr = [-24, 12];
    _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7));
    assert(arr == [-24, -1]);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
/// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated.
void _mm_stream_pd (double* mem_addr, __m128d a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    __m128d* dest = cast(__m128d*)mem_addr;
    *dest = a;
}

/// Store 128-bits of integer data from a into memory using a non-temporal memory hint.
/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception
/// may be generated.
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    __m128i* dest = cast(__m128i*)mem_addr;
    *dest = a;
}

/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache pollution.
/// If the cache line containing address mem_addr is already in the cache, the cache will be updated.
void _mm_stream_si32 (int* mem_addr, int a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Store 64-bit integer a into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address mem_addr is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit)
/// floating-point elements in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a - b;
}

/// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit)
/// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
/// upper element of result.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_subsd(a, b);
    }
    else
    {
        a.ptr[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}

/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    pragma(inline, true);
    return a - b;
}

/// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed integers in `a` using signed saturation.
__m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PSUBSW since LDC 1.15 -O0
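            // Saturating subtraction is expressed with LLVM's @llvm.ssub.sat intrinsic
            // through inline IR, declared in `prefix` below.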
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed integers in `a` using signed saturation.
__m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBSB since LDC 1.15 -O0
            // ARM: Generates sqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}
/// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned integers in `a` using unsigned saturation.
__m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSW since LDC 1.15 -O0
            // ARM: Generates uqsub.8h since LDC 1.21 -O0
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(sum);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubusw128(a, b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            res[i] = saturateSignedIntToUnsignedShort(sum);
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534, 1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}

/// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned integers in `a` using unsigned saturation.
__m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSB since LDC 1.15 -O0
            // ARM: Generates uqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
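            // Scalar fallback: widen each unsigned byte difference to int, then clamp it
            // back to the [0, 255] range.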
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return __builtin_ia32_psubusb128(a, b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Note: the only difference between the _mm_ucomi* and _mm_comi* intrinsics is their signalling
// behaviour on quiet NaNs. Aliasing them is therefore slightly incorrect, but the case where
// you would want to differentiate between qNaN and sNaN and then treat them differently
// on purpose seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd; ///
alias _mm_ucomige_sd = _mm_comige_sd; ///
alias _mm_ucomigt_sd = _mm_comigt_sd; ///
alias _mm_ucomile_sd = _mm_comile_sd; ///
alias _mm_ucomilt_sd = _mm_comilt_sd; ///
alias _mm_ucomineq_sd = _mm_comineq_sd; ///

/// Return vector of type `__m128d` with undefined elements.
__m128d _mm_undefined_pd() pure @safe
{
    pragma(inline, true);
    __m128d result = void;
    return result;
}

/// Return vector of type `__m128i` with undefined elements.
__m128i _mm_undefined_si128() pure @safe
{
    pragma(inline, true);
    __m128i result = void;
    return result;
}

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
4673 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted 4674 { 4675 static if (GDC_with_SSE2) 4676 { 4677 return __builtin_ia32_punpckhdq128(a, b); 4678 } 4679 else version(DigitalMars) 4680 { 4681 __m128i r; 4682 r.ptr[0] = a.array[2]; 4683 r.ptr[1] = b.array[2]; 4684 r.ptr[2] = a.array[3]; 4685 r.ptr[3] = b.array[3]; 4686 return r; 4687 } 4688 else 4689 { 4690 return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b); 4691 } 4692 } 4693 unittest 4694 { 4695 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4696 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 4697 __m128i C = _mm_unpackhi_epi32(A, B); 4698 int[4] correct = [3, 7, 4, 8]; 4699 assert(C.array == correct); 4700 } 4701 4702 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. 4703 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted 4704 { 4705 static if (GDC_with_SSE2) 4706 { 4707 return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b); 4708 } 4709 else 4710 { 4711 __m128i r = cast(__m128i)b; 4712 r[0] = a[2]; 4713 r[1] = a[3]; 4714 return r; 4715 } 4716 } 4717 unittest // Issue #36 4718 { 4719 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4720 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4721 long2 C = cast(long2)(_mm_unpackhi_epi64(A, B)); 4722 long[2] correct = [0x33333333_33333333, 0x55555555_55555555]; 4723 assert(C.array == correct); 4724 } 4725 4726 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. 4727 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe 4728 { 4729 static if (GDC_with_SSE2) 4730 { 4731 return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b); 4732 } 4733 else static if (DMD_with_32bit_asm) 4734 { 4735 asm pure nothrow @nogc @trusted 4736 { 4737 movdqu XMM0, a; 4738 movdqu XMM1, b; 4739 punpckhbw XMM0, XMM1; 4740 movdqu a, XMM0; 4741 } 4742 return a; 4743 } 4744 else 4745 { 4746 return cast(__m128i)shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27, 4747 12, 28, 13, 29, 14, 30, 15, 31) 4748 (cast(byte16)a, cast(byte16)b); 4749 } 4750 } 4751 unittest 4752 { 4753 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 4754 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 4755 byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B); 4756 byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]; 4757 assert(C.array == correct); 4758 } 4759 4760 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`. 4761 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe 4762 { 4763 static if (GDC_with_SSE2) 4764 { 4765 return __builtin_ia32_unpckhpd(a, b); 4766 } 4767 else 4768 { 4769 return shufflevector!(__m128d, 1, 3)(a, b); 4770 } 4771 } 4772 unittest 4773 { 4774 __m128d A = _mm_setr_pd(4.0, 6.0); 4775 __m128d B = _mm_setr_pd(7.0, 9.0); 4776 __m128d C = _mm_unpackhi_pd(A, B); 4777 double[2] correct = [6.0, 9.0]; 4778 assert(C.array == correct); 4779 } 4780 4781 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. 
4782 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe 4783 { 4784 static if (GDC_with_SSE2) 4785 { 4786 return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b); 4787 } 4788 else static if (DMD_with_32bit_asm) 4789 { 4790 asm pure nothrow @nogc @trusted 4791 { 4792 movdqu XMM0, a; 4793 movdqu XMM1, b; 4794 punpcklwd XMM0, XMM1; 4795 movdqu a, XMM0; 4796 } 4797 return a; 4798 } 4799 else 4800 { 4801 return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11) 4802 (cast(short8)a, cast(short8)b); 4803 } 4804 } 4805 unittest 4806 { 4807 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 4808 __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); 4809 short8 C = cast(short8) _mm_unpacklo_epi16(A, B); 4810 short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11]; 4811 assert(C.array == correct); 4812 } 4813 4814 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. 4815 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted 4816 { 4817 static if (GDC_with_SSE2) 4818 { 4819 return __builtin_ia32_punpckldq128(a, b); 4820 } 4821 else version(DigitalMars) 4822 { 4823 __m128i r; 4824 r.ptr[0] = a.array[0]; 4825 r.ptr[1] = b.array[0]; 4826 r.ptr[2] = a.array[1]; 4827 r.ptr[3] = b.array[1]; 4828 return r; 4829 } 4830 else 4831 { 4832 return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b); 4833 } 4834 } 4835 unittest 4836 { 4837 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4838 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 4839 __m128i C = _mm_unpacklo_epi32(A, B); 4840 int[4] correct = [1, 5, 2, 6]; 4841 assert(C.array == correct); 4842 } 4843 4844 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. 4845 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted 4846 { 4847 static if (GDC_with_SSE2) 4848 { 4849 return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b); 4850 } 4851 else 4852 { 4853 long2 lA = cast(long2)a; 4854 long2 lB = cast(long2)b; 4855 long2 R; 4856 R.ptr[0] = lA.array[0]; 4857 R.ptr[1] = lB.array[0]; 4858 return cast(__m128i)R; 4859 } 4860 } 4861 unittest // Issue #36 4862 { 4863 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4864 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4865 long2 C = cast(long2)(_mm_unpacklo_epi64(A, B)); 4866 long[2] correct = [0x22222222_22222222, 0x44444444_44444444]; 4867 assert(C.array == correct); 4868 } 4869 4870 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. 
4871 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe 4872 { 4873 static if (GDC_with_SSE2) 4874 { 4875 return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b); 4876 } 4877 else static if (DMD_with_32bit_asm) 4878 { 4879 asm pure nothrow @nogc @trusted 4880 { 4881 movdqu XMM0, a; 4882 movdqu XMM1, b; 4883 punpcklbw XMM0, XMM1; 4884 movdqu a, XMM0; 4885 } 4886 return a; 4887 } 4888 else 4889 { 4890 return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19, 4891 4, 20, 5, 21, 6, 22, 7, 23) 4892 (cast(byte16)a, cast(byte16)b); 4893 } 4894 } 4895 unittest 4896 { 4897 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 4898 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 4899 byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B); 4900 byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]; 4901 assert(C.array == correct); 4902 } 4903 4904 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`. 4905 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe 4906 { 4907 static if (GDC_with_SSE2) 4908 { 4909 return __builtin_ia32_unpcklpd(a, b); 4910 } 4911 else 4912 { 4913 return shufflevector!(__m128d, 0, 2)(a, b); 4914 } 4915 } 4916 unittest 4917 { 4918 __m128d A = _mm_setr_pd(4.0, 6.0); 4919 __m128d B = _mm_setr_pd(7.0, 9.0); 4920 __m128d C = _mm_unpacklo_pd(A, B); 4921 double[2] correct = [4.0, 7.0]; 4922 assert(C.array == correct); 4923 } 4924 4925 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 4926 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe 4927 { 4928 return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b); 4929 } 4930 // TODO unittest and thus force inline 4931 4932 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`. 4933 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe 4934 { 4935 return a ^ b; 4936 } 4937 // TODO unittest and thus force inline 4938 4939 unittest 4940 { 4941 float distance(float[4] a, float[4] b) nothrow @nogc 4942 { 4943 __m128 va = _mm_loadu_ps(a.ptr); 4944 __m128 vb = _mm_loadu_ps(b.ptr); 4945 __m128 diffSquared = _mm_sub_ps(va, vb); 4946 diffSquared = _mm_mul_ps(diffSquared, diffSquared); 4947 __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared)); 4948 sum = _mm_add_ps(sum, _mm_srli_ps!4(sum)); 4949 return _mm_cvtss_f32(_mm_sqrt_ss(sum)); 4950 } 4951 assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2); 4952 }
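// Supplementary sanity checks for intrinsics defined above that have no unittest yet
// (_mm_sqrt_pd, _mm_sub_epi32, _mm_storeu_pd, _mm_storeu_si128, _mm_xor_pd, _mm_xor_si128).
// They use only intrinsics from this module; the expected values follow directly from the
// documented operations.
unittest
{
    __m128d A = _mm_setr_pd(4.0, 9.0);
    __m128d R = _mm_sqrt_pd(A);
    double[2] correct = [2.0, 3.0];
    assert(R.array == correct);
}
unittest
{
    __m128i A = _mm_setr_epi32(4, 8, 13, -7);
    __m128i B = _mm_setr_epi32(1, 2, 3, -4);
    __m128i R = _mm_sub_epi32(A, B);
    int[4] correct = [3, 6, 10, -3];
    assert(R.array == correct);
}
unittest
{
    double[2] buf;
    _mm_storeu_pd(buf.ptr, _mm_setr_pd(1.5, -2.0));
    double[2] correct = [1.5, -2.0];
    assert(buf == correct);
}
unittest
{
    int[4] buf;
    _mm_storeu_si128(cast(__m128i*) buf.ptr, _mm_setr_epi32(1, 2, 3, 4));
    int[4] correct = [1, 2, 3, 4];
    assert(buf == correct);
}
unittest
{
    // XOR with the sign-bit pattern negates both lanes.
    __m128d A = _mm_setr_pd(4.0, -2.0);
    __m128d R = _mm_xor_pd(A, _mm_setr_pd(-0.0, -0.0));
    double[2] correct = [-4.0, 2.0];
    assert(R.array == correct);
}
unittest
{
    __m128i A = _mm_setr_epi32(0x3C3C3C3C, 0, -1, 0x12345678);
    __m128i B = _mm_setr_epi32(0x0F0F0F0F, -1, -1, 0);
    __m128i R = _mm_xor_si128(A, B);
    int[4] correct = [0x33333333, -1, 0, 0x12345678];
    assert(R.array == correct);
}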