1 /** 2 * SSE2 intrinsics. 3 * 4 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019. 5 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 6 */ 7 module inteli.emmintrin; 8 9 public import inteli.types; 10 public import inteli.xmmintrin; // SSE2 includes SSE1 11 import inteli.mmx; 12 import inteli.internals; 13 14 nothrow @nogc: 15 16 17 // SSE2 instructions 18 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2 19 20 /// Add packed 16-bit integers in `a` and `b`. 21 __m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe 22 { 23 return cast(__m128i)(cast(short8)a + cast(short8)b); 24 } 25 unittest 26 { 27 __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77); 28 short8 R = cast(short8) _mm_add_epi16(A, A); 29 short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154]; 30 assert(R.array == correct); 31 } 32 33 /// Add packed 32-bit integers in `a` and `b`. 34 __m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe 35 { 36 return cast(__m128i)(cast(int4)a + cast(int4)b); 37 } 38 unittest 39 { 40 __m128i A = _mm_setr_epi32( -7, -1, 0, 9); 41 int4 R = _mm_add_epi32(A, A); 42 int[4] correct = [ -14, -2, 0, 18 ]; 43 assert(R.array == correct); 44 } 45 46 /// Add packed 64-bit integers in `a` and `b`. 47 __m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe 48 { 49 return cast(__m128i)(cast(long2)a + cast(long2)b); 50 } 51 unittest 52 { 53 __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000); 54 long2 R = cast(long2) _mm_add_epi64(A, A); 55 long[2] correct = [ -2, 0 ]; 56 assert(R.array == correct); 57 } 58 59 /// Add packed 8-bit integers in `a` and `b`. 60 __m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe 61 { 62 return cast(__m128i)(cast(byte16)a + cast(byte16)b); 63 } 64 unittest 65 { 66 __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78); 67 byte16 R = cast(byte16) _mm_add_epi8(A, A); 68 byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100]; 69 assert(R.array == correct); 70 } 71 72 /// Add the lower double-precision (64-bit) floating-point element 73 /// in `a` and `b`, store the result in the lower element of dst, 74 /// and copy the upper element from `a` to the upper element of destination. 75 __m128d _mm_add_sd(__m128d a, __m128d b) pure @safe 76 { 77 static if (GDC_with_SSE2) 78 { 79 return __builtin_ia32_addsd(a, b); 80 } 81 else version(DigitalMars) 82 { 83 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 84 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 85 asm pure nothrow @nogc @trusted { nop;} 86 a[0] = a[0] + b[0]; 87 return a; 88 } 89 else 90 { 91 a[0] += b[0]; 92 return a; 93 } 94 } 95 unittest 96 { 97 __m128d a = [1.5, -2.0]; 98 a = _mm_add_sd(a, a); 99 assert(a.array == [3.0, -2.0]); 100 } 101 102 /// Add packed double-precision (64-bit) floating-point elements in `a` and `b`. 103 __m128d _mm_add_pd (__m128d a, __m128d b) pure @safe 104 { 105 return a + b; 106 } 107 unittest 108 { 109 __m128d a = [1.5, -2.0]; 110 a = _mm_add_pd(a, a); 111 assert(a.array == [3.0, -4.0]); 112 } 113 114 /// Add 64-bit integers `a` and `b`. 115 __m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe 116 { 117 return a + b; 118 } 119 120 /// Add packed 16-bit integers in `a` and `b` using signed saturation. 121 __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted 122 { 123 static if (GDC_with_SSE2) 124 { 125 return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b); 126 } 127 else version(LDC) 128 { 129 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 130 { 131 // x86: Generates PADDSW since LDC 1.15 -O0 132 // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20 133 enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 134 enum ir = ` 135 %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 136 ret <8 x i16> %r`; 137 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 138 } 139 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 140 { 141 short[8] res; 142 short8 sa = cast(short8)a; 143 short8 sb = cast(short8)b; 144 foreach(i; 0..8) 145 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 146 return _mm_loadu_si128(cast(int4*)res.ptr); 147 } 148 else 149 return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b); 150 } 151 else 152 { 153 short[8] res; 154 short8 sa = cast(short8)a; 155 short8 sb = cast(short8)b; 156 foreach(i; 0..8) 157 res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]); 158 return _mm_loadu_si128(cast(int4*)res.ptr); 159 } 160 } 161 unittest 162 { 163 short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), 164 _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0)); 165 static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14]; 166 assert(res.array == correctResult); 167 } 168 169 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation. 170 __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted 171 { 172 static if (GDC_with_SSE2) 173 { 174 return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b); 175 } 176 else version(LDC) 177 { 178 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 179 { 180 // x86: Generates PADDSB since LDC 1.15 -O0 181 // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20 182 enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 183 enum ir = ` 184 %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 185 ret <16 x i8> %r`; 186 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 187 } 188 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 189 { 190 byte[16] res; 191 byte16 sa = cast(byte16)a; 192 byte16 sb = cast(byte16)b; 193 foreach(i; 0..16) 194 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]); 195 return _mm_loadu_si128(cast(int4*)res.ptr); 196 } 197 else 198 return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b); 199 } 200 else 201 { 202 byte[16] res; 203 byte16 sa = cast(byte16)a; 204 byte16 sb = cast(byte16)b; 205 foreach(i; 0..16) 206 res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]); 207 return _mm_loadu_si128(cast(int4*)res.ptr); 208 } 209 } 210 unittest 211 { 212 byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 213 _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 214 static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14, 215 16, 18, 20, 22, 24, 26, 28, 30]; 216 assert(res.array == correctResult); 217 } 218 219 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 220 // PERF: #GDC version? 221 __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted 222 { 223 version(LDC) 224 { 225 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 226 { 227 // x86: Generates PADDUSB since LDC 1.15 -O0 228 // ARM: Generates uqadd.16b since LDC 1.21 -O1 229 enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 230 enum ir = ` 231 %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 232 ret <16 x i8> %r`; 233 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 234 } 235 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 236 { 237 ubyte[16] res; 238 byte16 sa = cast(byte16)a; 239 byte16 sb = cast(byte16)b; 240 foreach(i; 0..16) 241 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 242 return _mm_loadu_si128(cast(int4*)res.ptr); 243 } 244 else 245 return __builtin_ia32_paddusb128(a, b); 246 } 247 else 248 { 249 ubyte[16] res; 250 byte16 sa = cast(byte16)a; 251 byte16 sb = cast(byte16)b; 252 foreach(i; 0..16) 253 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i])); 254 return _mm_loadu_si128(cast(int4*)res.ptr); 255 } 256 } 257 unittest 258 { 259 byte16 res = cast(byte16) 260 _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0), 261 _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0)); 262 static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14, 263 0, cast(byte)255, 4, 6, 8, 10, 12, 14]; 264 assert(res.array == correctResult); 265 } 266 267 /// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation. 268 // PERF: #GDC version? 269 __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted 270 { 271 version(LDC) 272 { 273 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 274 { 275 // x86: Generates PADDUSW since LDC 1.15 -O0 276 // ARM: Generates uqadd.8h since LDC 1.21 -O1 277 enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 278 enum ir = ` 279 %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 280 ret <8 x i16> %r`; 281 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 282 } 283 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 284 { 285 ushort[8] res; 286 short8 sa = cast(short8)a; 287 short8 sb = cast(short8)b; 288 foreach(i; 0..8) 289 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 290 return _mm_loadu_si128(cast(int4*)res.ptr); 291 } 292 else 293 return __builtin_ia32_paddusw128(a, b); 294 } 295 else 296 { 297 ushort[8] res; 298 short8 sa = cast(short8)a; 299 short8 sb = cast(short8)b; 300 foreach(i; 0..8) 301 res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i])); 302 return _mm_loadu_si128(cast(int4*)res.ptr); 303 } 304 } 305 unittest 306 { 307 short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0), 308 _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0)); 309 static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6]; 310 assert(res.array == correctResult); 311 } 312 313 /// Compute the bitwise AND of packed double-precision (64-bit) 314 /// floating-point elements in `a` and `b`. 315 __m128d _mm_and_pd (__m128d a, __m128d b) pure @safe 316 { 317 return cast(__m128d)( cast(long2)a & cast(long2)b ); 318 } 319 unittest 320 { 321 double a = 4.32; 322 double b = -78.99; 323 long correct = (*cast(long*)(&a)) & (*cast(long*)(&b)); 324 __m128d A = _mm_set_pd(a, b); 325 __m128d B = _mm_set_pd(b, a); 326 long2 R = cast(long2)( _mm_and_pd(A, B) ); 327 assert(R.array[0] == correct); 328 assert(R.array[1] == correct); 329 } 330 331 /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`. 332 __m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe 333 { 334 return a & b; 335 } 336 unittest 337 { 338 __m128i A = _mm_set1_epi32(7); 339 __m128i B = _mm_set1_epi32(14); 340 __m128i R = _mm_and_si128(A, B); 341 int[4] correct = [6, 6, 6, 6]; 342 assert(R.array == correct); 343 } 344 345 /// Compute the bitwise NOT of packed double-precision (64-bit) 346 /// floating-point elements in `a` and then AND with `b`. 347 __m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe 348 { 349 return cast(__m128d)( ~(cast(long2)a) & cast(long2)b); 350 } 351 unittest 352 { 353 double a = 4.32; 354 double b = -78.99; 355 long correct = (~*cast(long*)(&a)) & ( *cast(long*)(&b)); 356 long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b)); 357 __m128d A = _mm_setr_pd(a, b); 358 __m128d B = _mm_setr_pd(b, a); 359 long2 R = cast(long2)( _mm_andnot_pd(A, B) ); 360 assert(R.array[0] == correct); 361 assert(R.array[1] == correct2); 362 } 363 364 /// Compute the bitwise NOT of 128 bits (representing integer data) 365 /// in `a` and then AND with `b`. 366 __m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe 367 { 368 return (~a) & b; 369 } 370 unittest 371 { 372 __m128i A = _mm_set1_epi32(7); 373 __m128i B = _mm_set1_epi32(14); 374 __m128i R = _mm_andnot_si128(A, B); 375 int[4] correct = [8, 8, 8, 8]; 376 assert(R.array == correct); 377 } 378 379 /// Average packed unsigned 16-bit integers in `a` and `b`. 380 __m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted 381 { 382 static if (GDC_with_SSE2) 383 { 384 return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b); 385 } 386 else static if (LDC_with_ARM64) 387 { 388 return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b); 389 } 390 else version(LDC) 391 { 392 // Generates pavgw even in LDC 1.0, even in -O0 393 // But not in ARM 394 enum ir = ` 395 %ia = zext <8 x i16> %0 to <8 x i32> 396 %ib = zext <8 x i16> %1 to <8 x i32> 397 %isum = add <8 x i32> %ia, %ib 398 %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 399 %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 400 %r = trunc <8 x i32> %isums to <8 x i16> 401 ret <8 x i16> %r`; 402 return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b); 403 } 404 else 405 { 406 short8 sa = cast(short8)a; 407 short8 sb = cast(short8)b; 408 short8 sr = void; 409 foreach(i; 0..8) 410 { 411 sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 ); 412 } 413 return cast(int4)sr; 414 } 415 } 416 unittest 417 { 418 __m128i A = _mm_set1_epi16(31); 419 __m128i B = _mm_set1_epi16(64); 420 short8 avg = cast(short8)(_mm_avg_epu16(A, B)); 421 foreach(i; 0..8) 422 assert(avg.array[i] == 48); 423 } 424 425 /// Average packed unsigned 8-bit integers in `a` and `b`. 426 __m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted 427 { 428 static if (GDC_with_SSE2) 429 { 430 return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b); 431 } 432 else static if (LDC_with_ARM64) 433 { 434 return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b); 435 } 436 else version(LDC) 437 { 438 // Generates pavgb even in LDC 1.0, even in -O0 439 // But not in ARM 440 enum ir = ` 441 %ia = zext <16 x i8> %0 to <16 x i16> 442 %ib = zext <16 x i8> %1 to <16 x i16> 443 %isum = add <16 x i16> %ia, %ib 444 %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 445 %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 446 %r = trunc <16 x i16> %isums to <16 x i8> 447 ret <16 x i8> %r`; 448 return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 449 } 450 else 451 { 452 byte16 sa = cast(byte16)a; 453 byte16 sb = cast(byte16)b; 454 byte16 sr = void; 455 foreach(i; 0..16) 456 { 457 sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 ); 458 } 459 return cast(int4)sr; 460 } 461 } 462 unittest 463 { 464 __m128i A = _mm_set1_epi8(31); 465 __m128i B = _mm_set1_epi8(64); 466 byte16 avg = cast(byte16)(_mm_avg_epu8(A, B)); 467 foreach(i; 0..16) 468 assert(avg.array[i] == 48); 469 } 470 471 /// Shift `a` left by `bytes` bytes while shifting in zeros. 472 alias _mm_bslli_si128 = _mm_slli_si128; 473 unittest 474 { 475 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 476 byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; 477 __m128i result = _mm_bslli_si128!5(toShift); 478 assert( (cast(byte16)result).array == exact); 479 } 480 481 /// Shift `v` right by `bytes` bytes while shifting in zeros. 482 alias _mm_bsrli_si128 = _mm_srli_si128; 483 unittest 484 { 485 __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 486 byte[16] exact = [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0]; 487 __m128i result = _mm_bsrli_si128!5(toShift); 488 assert( (cast(byte16)result).array == exact); 489 } 490 491 /// Cast vector of type `__m128d` to type `__m128`. 492 /// Note: Also possible with a regular `cast(__m128)(a)`. 493 __m128 _mm_castpd_ps (__m128d a) pure @safe 494 { 495 return cast(__m128)a; 496 } 497 498 /// Cast vector of type `__m128d` to type `__m128i`. 499 /// Note: Also possible with a regular `cast(__m128i)(a)`. 500 __m128i _mm_castpd_si128 (__m128d a) pure @safe 501 { 502 return cast(__m128i)a; 503 } 504 505 /// Cast vector of type `__m128` to type `__m128d`. 506 /// Note: Also possible with a regular `cast(__m128d)(a)`. 507 __m128d _mm_castps_pd (__m128 a) pure @safe 508 { 509 return cast(__m128d)a; 510 } 511 512 /// Cast vector of type `__m128` to type `__m128i`. 513 /// Note: Also possible with a regular `cast(__m128i)(a)`. 514 __m128i _mm_castps_si128 (__m128 a) pure @safe 515 { 516 return cast(__m128i)a; 517 } 518 519 /// Cast vector of type `__m128i` to type `__m128d`. 520 /// Note: Also possible with a regular `cast(__m128d)(a)`. 521 __m128d _mm_castsi128_pd (__m128i a) pure @safe 522 { 523 return cast(__m128d)a; 524 } 525 526 /// Cast vector of type `__m128i` to type `__m128`. 527 /// Note: Also possible with a regular `cast(__m128)(a)`. 528 __m128 _mm_castsi128_ps (__m128i a) pure @safe 529 { 530 return cast(__m128)a; 531 } 532 533 /// Invalidate and flush the cache line that contains `p` 534 /// from all levels of the cache hierarchy. 535 void _mm_clflush (const(void)* p) @trusted 536 { 537 static if (GDC_with_SSE2) 538 { 539 __builtin_ia32_clflush(p); 540 } 541 else static if (LDC_with_SSE2) 542 { 543 __builtin_ia32_clflush(cast(void*)p); 544 } 545 else version(D_InlineAsm_X86) 546 { 547 asm pure nothrow @nogc @safe 548 { 549 mov EAX, p; 550 clflush [EAX]; 551 } 552 } 553 else version(D_InlineAsm_X86_64) 554 { 555 asm pure nothrow @nogc @safe 556 { 557 mov RAX, p; 558 clflush [RAX]; 559 } 560 } 561 else 562 { 563 // Do nothing. Invalidating cacheline does 564 // not affect correctness. 565 } 566 } 567 unittest 568 { 569 ubyte[64] cacheline; 570 _mm_clflush(cacheline.ptr); 571 } 572 573 /// Compare packed 16-bit integers in `a` and `b` for equality. 574 __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe 575 { 576 static if (GDC_with_SSE2) 577 { 578 return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b); 579 } 580 else 581 { 582 return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b); 583 } 584 } 585 unittest 586 { 587 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 588 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 589 short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0]; 590 short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B)); 591 assert(R.array == E); 592 } 593 594 /// Compare packed 32-bit integers in `a` and `b` for equality. 595 __m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe 596 { 597 static if (GDC_with_SSE2) 598 { 599 return __builtin_ia32_pcmpeqd128(a, b); 600 } 601 else 602 { 603 return equalMask!__m128i(a, b); 604 } 605 } 606 unittest 607 { 608 int4 A = [-3, -2, -1, 0]; 609 int4 B = [ 4, -2, 2, 0]; 610 int[4] E = [ 0, -1, 0, -1]; 611 int4 R = cast(int4)(_mm_cmpeq_epi32(A, B)); 612 assert(R.array == E); 613 } 614 615 /// Compare packed 8-bit integers in `a` and `b` for equality. 616 __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe 617 { 618 static if (GDC_with_SSE2) 619 { 620 return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b); 621 } 622 else 623 { 624 return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b); 625 } 626 } 627 unittest 628 { 629 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 630 __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 631 byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B); 632 byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1]; 633 assert(C.array == correct); 634 } 635 636 /// Compare packed double-precision (64-bit) floating-point elements 637 /// in `a` and `b` for equality. 638 __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe 639 { 640 static if (GDC_with_SSE2) 641 { 642 return __builtin_ia32_cmpeqpd(a, b); 643 } 644 else 645 { 646 return cast(__m128d) cmppd!(FPComparison.oeq)(a, b); 647 } 648 } 649 650 /// Compare the lower double-precision (64-bit) floating-point elements 651 /// in `a` and `b` for equality, store the result in the lower element, 652 /// and copy the upper element from `a`. 653 __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe 654 { 655 static if (GDC_with_SSE2) 656 { 657 return __builtin_ia32_cmpeqsd(a, b); 658 } 659 else 660 { 661 return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b); 662 } 663 } 664 665 /// Compare packed double-precision (64-bit) floating-point elements 666 /// in `a` and `b` for greater-than-or-equal. 667 __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe 668 { 669 static if (GDC_with_SSE2) 670 { 671 return __builtin_ia32_cmpgepd(a, b); 672 } 673 else 674 { 675 return cast(__m128d) cmppd!(FPComparison.oge)(a, b); 676 } 677 } 678 679 /// Compare the lower double-precision (64-bit) floating-point elements 680 /// in `a` and `b` for greater-than-or-equal, store the result in the 681 /// lower element, and copy the upper element from `a`. 682 __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe 683 { 684 // Note: There is no __builtin_ia32_cmpgesd builtin. 685 static if (GDC_with_SSE2) 686 { 687 return __builtin_ia32_cmpnltsd(b, a); 688 } 689 else 690 { 691 return cast(__m128d) cmpsd!(FPComparison.oge)(a, b); 692 } 693 } 694 695 /// Compare packed 16-bit integers in `a` and `b` for greater-than. 696 __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe 697 { 698 static if (GDC_with_SSE2) 699 { 700 return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b); 701 } 702 else 703 { 704 return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b); 705 } 706 } 707 unittest 708 { 709 short8 A = [-3, -2, -1, 0, 0, 1, 2, 3]; 710 short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3]; 711 short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1]; 712 short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B)); 713 assert(R.array == E); 714 } 715 716 /// Compare packed 32-bit integers in `a` and `b` for greater-than. 717 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe 718 { 719 static if (GDC_with_SSE2) 720 { 721 return __builtin_ia32_pcmpgtd128(a, b); 722 } 723 else 724 { 725 return cast(__m128i)( greaterMask!int4(a, b)); 726 } 727 } 728 unittest 729 { 730 int4 A = [-3, 2, -1, 0]; 731 int4 B = [ 4, -2, 2, 0]; 732 int[4] E = [ 0, -1, 0, 0]; 733 int4 R = cast(int4)(_mm_cmpgt_epi32(A, B)); 734 assert(R.array == E); 735 } 736 737 /// Compare packed 8-bit integers in `a` and `b` for greater-than. 738 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe 739 { 740 static if (GDC_with_SSE2) 741 { 742 return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b); 743 } 744 else 745 { 746 return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b); 747 } 748 } 749 unittest 750 { 751 __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1); 752 __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1); 753 byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B); 754 byte[16] correct = [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0]; 755 __m128i D = _mm_cmpeq_epi8(A, B); 756 assert(C.array == correct); 757 } 758 759 /// Compare packed double-precision (64-bit) floating-point elements 760 /// in `a` and `b` for greater-than. 761 __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe 762 { 763 static if (GDC_with_SSE2) 764 { 765 return __builtin_ia32_cmpgtpd(a, b); 766 } 767 else 768 { 769 return cast(__m128d) cmppd!(FPComparison.ogt)(a, b); 770 } 771 } 772 773 /// Compare the lower double-precision (64-bit) floating-point elements 774 /// in `a` and `b` for greater-than, store the result in the lower element, 775 /// and copy the upper element from `a`. 776 __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe 777 { 778 // Note: There is no __builtin_ia32_cmpgtsd builtin. 779 static if (GDC_with_SSE2) 780 { 781 return __builtin_ia32_cmpnlesd(b, a); 782 } 783 else 784 { 785 return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b); 786 } 787 } 788 789 /// Compare packed double-precision (64-bit) floating-point elements 790 /// in `a` and `b` for less-than-or-equal. 791 __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe 792 { 793 static if (GDC_with_SSE2) 794 { 795 return __builtin_ia32_cmplepd(a, b); 796 } 797 else 798 { 799 return cast(__m128d) cmppd!(FPComparison.ole)(a, b); 800 } 801 } 802 803 /// Compare the lower double-precision (64-bit) floating-point elements 804 /// in `a` and `b` for less-than-or-equal, store the result in the 805 /// lower element, and copy the upper element from `a`. 806 __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe 807 { 808 static if (GDC_with_SSE2) 809 { 810 return __builtin_ia32_cmplesd(a, b); 811 } 812 else 813 { 814 return cast(__m128d) cmpsd!(FPComparison.ole)(a, b); 815 } 816 } 817 818 /// Compare packed 16-bit integers in `a` and `b` for less-than. 819 __m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe 820 { 821 return _mm_cmpgt_epi16(b, a); 822 } 823 824 /// Compare packed 32-bit integers in `a` and `b` for less-than. 825 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe 826 { 827 return _mm_cmpgt_epi32(b, a); 828 } 829 830 /// Compare packed 8-bit integers in `a` and `b` for less-than. 831 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe 832 { 833 return _mm_cmpgt_epi8(b, a); 834 } 835 836 /// Compare packed double-precision (64-bit) floating-point elements 837 /// in `a` and `b` for less-than. 838 __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe 839 { 840 static if (GDC_with_SSE2) 841 { 842 return __builtin_ia32_cmpltpd(a, b); 843 } 844 else 845 { 846 return cast(__m128d) cmppd!(FPComparison.olt)(a, b); 847 } 848 } 849 850 /// Compare the lower double-precision (64-bit) floating-point elements 851 /// in `a` and `b` for less-than, store the result in the lower 852 /// element, and copy the upper element from `a`. 853 __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe 854 { 855 static if (GDC_with_SSE2) 856 { 857 return __builtin_ia32_cmpltsd(a, b); 858 } 859 else 860 { 861 return cast(__m128d) cmpsd!(FPComparison.olt)(a, b); 862 } 863 } 864 865 /// Compare packed double-precision (64-bit) floating-point elements 866 /// in `a` and `b` for not-equal. 867 __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe 868 { 869 static if (GDC_with_SSE2) 870 { 871 return __builtin_ia32_cmpneqpd(a, b); 872 } 873 else 874 { 875 return cast(__m128d) cmppd!(FPComparison.une)(a, b); 876 } 877 } 878 879 /// Compare the lower double-precision (64-bit) floating-point elements 880 /// in `a` and `b` for not-equal, store the result in the lower 881 /// element, and copy the upper element from `a`. 882 __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe 883 { 884 static if (GDC_with_SSE2) 885 { 886 return __builtin_ia32_cmpneqsd(a, b); 887 } 888 else 889 { 890 return cast(__m128d) cmpsd!(FPComparison.une)(a, b); 891 } 892 } 893 894 /// Compare packed double-precision (64-bit) floating-point elements 895 /// in `a` and `b` for not-greater-than-or-equal. 896 __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe 897 { 898 static if (GDC_with_SSE2) 899 { 900 return __builtin_ia32_cmpngepd(a, b); 901 } 902 else 903 { 904 return cast(__m128d) cmppd!(FPComparison.ult)(a, b); 905 } 906 } 907 908 /// Compare the lower double-precision (64-bit) floating-point elements 909 /// in `a` and `b` for not-greater-than-or-equal, store the result in 910 /// the lower element, and copy the upper element from `a`. 911 __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe 912 { 913 // Note: There is no __builtin_ia32_cmpngesd builtin. 914 static if (GDC_with_SSE2) 915 { 916 return __builtin_ia32_cmpltsd(b, a); 917 } 918 else 919 { 920 return cast(__m128d) cmpsd!(FPComparison.ult)(a, b); 921 } 922 } 923 924 /// Compare packed double-precision (64-bit) floating-point elements 925 /// in `a` and `b` for not-greater-than. 926 __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe 927 { 928 static if (GDC_with_SSE2) 929 { 930 return __builtin_ia32_cmpngtpd(a, b); 931 } 932 else 933 { 934 return cast(__m128d) cmppd!(FPComparison.ule)(a, b); 935 } 936 } 937 938 /// Compare the lower double-precision (64-bit) floating-point elements 939 /// in `a` and `b` for not-greater-than, store the result in the 940 /// lower element, and copy the upper element from `a`. 941 __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe 942 { 943 // Note: There is no __builtin_ia32_cmpngtsd builtin. 944 static if (GDC_with_SSE2) 945 { 946 return __builtin_ia32_cmplesd(b, a); 947 } 948 else 949 { 950 return cast(__m128d) cmpsd!(FPComparison.ule)(a, b); 951 } 952 } 953 954 /// Compare packed double-precision (64-bit) floating-point elements 955 /// in `a` and `b` for not-less-than-or-equal. 956 __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe 957 { 958 static if (GDC_with_SSE2) 959 { 960 return __builtin_ia32_cmpnlepd(a, b); 961 } 962 else 963 { 964 return cast(__m128d) cmppd!(FPComparison.ugt)(a, b); 965 } 966 } 967 968 /// Compare the lower double-precision (64-bit) floating-point elements 969 /// in `a` and `b` for not-less-than-or-equal, store the result in the 970 /// lower element, and copy the upper element from `a`. 971 __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe 972 { 973 static if (GDC_with_SSE2) 974 { 975 return __builtin_ia32_cmpnlesd(a, b); 976 } 977 else 978 { 979 return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b); 980 } 981 } 982 983 /// Compare packed double-precision (64-bit) floating-point elements 984 /// in `a` and `b` for not-less-than. 985 __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe 986 { 987 static if (GDC_with_SSE2) 988 { 989 return __builtin_ia32_cmpnltpd(a, b); 990 } 991 else 992 { 993 return cast(__m128d) cmppd!(FPComparison.uge)(a, b); 994 } 995 } 996 997 /// Compare the lower double-precision (64-bit) floating-point elements 998 /// in `a` and `b` for not-less-than, store the result in the lower 999 /// element, and copy the upper element from `a`. 1000 __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe 1001 { 1002 static if (GDC_with_SSE2) 1003 { 1004 return __builtin_ia32_cmpnltsd(a, b); 1005 } 1006 else 1007 { 1008 return cast(__m128d) cmpsd!(FPComparison.uge)(a, b); 1009 } 1010 } 1011 1012 /// Compare packed double-precision (64-bit) floating-point elements 1013 /// in `a` and `b` to see if neither is NaN. 1014 __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe 1015 { 1016 static if (GDC_with_SSE2) 1017 { 1018 return __builtin_ia32_cmpordpd(a, b); 1019 } 1020 else 1021 { 1022 return cast(__m128d) cmppd!(FPComparison.ord)(a, b); 1023 } 1024 } 1025 1026 /// Compare the lower double-precision (64-bit) floating-point elements 1027 /// in `a` and `b` to see if neither is NaN, store the result in the 1028 /// lower element, and copy the upper element from `a` to the upper element. 1029 __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe 1030 { 1031 static if (GDC_with_SSE2) 1032 { 1033 return __builtin_ia32_cmpordsd(a, b); 1034 } 1035 else 1036 { 1037 return cast(__m128d) cmpsd!(FPComparison.ord)(a, b); 1038 } 1039 } 1040 1041 /// Compare packed double-precision (64-bit) floating-point elements 1042 /// in `a` and `b` to see if either is NaN. 1043 __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe 1044 { 1045 static if (GDC_with_SSE2) 1046 { 1047 return __builtin_ia32_cmpunordpd(a, b); 1048 } 1049 else 1050 { 1051 return cast(__m128d) cmppd!(FPComparison.uno)(a, b); 1052 } 1053 } 1054 1055 /// Compare the lower double-precision (64-bit) floating-point elements 1056 /// in `a` and `b` to see if either is NaN, store the result in the lower 1057 /// element, and copy the upper element from `a` to the upper element. 1058 __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe 1059 { 1060 static if (GDC_with_SSE2) 1061 { 1062 return __builtin_ia32_cmpunordsd(a, b); 1063 } 1064 else 1065 { 1066 return cast(__m128d) cmpsd!(FPComparison.uno)(a, b); 1067 } 1068 } 1069 1070 /// Compare the lower double-precision (64-bit) floating-point element 1071 /// in `a` and `b` for equality, and return the boolean result (0 or 1). 1072 int _mm_comieq_sd (__m128d a, __m128d b) pure @safe 1073 { 1074 // Note: For some of the _mm_comixx_sx intrinsics, NaN semantics of the intrinsic are not the same as the 1075 // comisd instruction, it returns false in case of unordered instead. 1076 // 1077 // Actually C++ compilers disagree over the meaning of that instruction. 1078 // GCC will manage NaNs like the comisd instruction (return true if unordered), 1079 // but ICC, clang and MSVC will deal with NaN like the Intel Intrinsics Guide says. 1080 // We choose to do like the most numerous. It seems GCC is buggy with NaNs. 1081 return a.array[0] == b.array[0]; 1082 } 1083 unittest 1084 { 1085 assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1086 assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1087 assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1088 assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1089 assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); 1090 } 1091 1092 /// Compare the lower double-precision (64-bit) floating-point element 1093 /// in `a` and `b` for greater-than-or-equal, and return the boolean 1094 /// result (0 or 1). 1095 int _mm_comige_sd (__m128d a, __m128d b) pure @safe 1096 { 1097 return a.array[0] >= b.array[0]; 1098 } 1099 unittest 1100 { 1101 assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1102 assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1103 assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0))); 1104 assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1105 assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1106 assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0))); 1107 } 1108 1109 /// Compare the lower double-precision (64-bit) floating-point element 1110 /// in `a` and `b` for greater-than, and return the boolean result (0 or 1). 1111 int _mm_comigt_sd (__m128d a, __m128d b) pure @safe 1112 { 1113 return a.array[0] > b.array[0]; 1114 } 1115 unittest 1116 { 1117 assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1118 assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1119 assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1120 assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1121 assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); 1122 } 1123 1124 /// Compare the lower double-precision (64-bit) floating-point element 1125 /// in `a` and `b` for less-than-or-equal. 1126 int _mm_comile_sd (__m128d a, __m128d b) pure @safe 1127 { 1128 return a.array[0] <= b.array[0]; 1129 } 1130 unittest 1131 { 1132 assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1133 assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1134 assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0))); 1135 assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1136 assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1137 assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); 1138 } 1139 1140 /// Compare the lower double-precision (64-bit) floating-point element 1141 /// in `a` and `b` for less-than, and return the boolean result (0 or 1). 1142 int _mm_comilt_sd (__m128d a, __m128d b) pure @safe 1143 { 1144 return a.array[0] < b.array[0]; 1145 } 1146 unittest 1147 { 1148 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1149 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1150 assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0))); 1151 assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1152 assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1153 assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0))); 1154 } 1155 1156 /// Compare the lower double-precision (64-bit) floating-point element 1157 /// in `a` and `b` for not-equal, and return the boolean result (0 or 1). 1158 int _mm_comineq_sd (__m128d a, __m128d b) pure @safe 1159 { 1160 return a.array[0] != b.array[0]; 1161 } 1162 unittest 1163 { 1164 assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0))); 1165 assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0))); 1166 assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan))); 1167 assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22))); 1168 assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0))); 1169 } 1170 1171 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) 1172 /// floating-point elements. 1173 __m128d _mm_cvtepi32_pd (__m128i a) pure @trusted 1174 { 1175 version(LDC) 1176 { 1177 // Generates cvtdq2pd since LDC 1.0, even without optimizations 1178 enum ir = ` 1179 %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1> 1180 %r = sitofp <2 x i32> %v to <2 x double> 1181 ret <2 x double> %r`; 1182 return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a); 1183 } 1184 else static if (GDC_with_SSE2) 1185 { 1186 return __builtin_ia32_cvtdq2pd(a); 1187 } 1188 else 1189 { 1190 double2 r = void; 1191 r.ptr[0] = a.array[0]; 1192 r.ptr[1] = a.array[1]; 1193 return r; 1194 } 1195 } 1196 unittest 1197 { 1198 __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54)); 1199 assert(A.array[0] == 54.0); 1200 assert(A.array[1] == 54.0); 1201 } 1202 1203 /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) 1204 /// floating-point elements. 1205 __m128 _mm_cvtepi32_ps(__m128i a) pure @trusted 1206 { 1207 static if (GDC_with_SSE2) 1208 { 1209 return __builtin_ia32_cvtdq2ps(a); 1210 } 1211 else 1212 { 1213 // x86: Generates cvtdq2ps since LDC 1.0.0 -O1 1214 // ARM: Generats scvtf.4s since LDC 1.8.0 -02 1215 __m128 res; 1216 res.ptr[0] = cast(float)a.array[0]; 1217 res.ptr[1] = cast(float)a.array[1]; 1218 res.ptr[2] = cast(float)a.array[2]; 1219 res.ptr[3] = cast(float)a.array[3]; 1220 return res; 1221 } 1222 } 1223 unittest 1224 { 1225 __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000)); 1226 assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]); 1227 } 1228 1229 /// Convert packed double-precision (64-bit) floating-point elements 1230 /// in `a` to packed 32-bit integers. 1231 __m128i _mm_cvtpd_epi32 (__m128d a) @trusted 1232 { 1233 // PERF ARM32 1234 static if (LDC_with_SSE2) 1235 { 1236 return __builtin_ia32_cvtpd2dq(a); 1237 } 1238 else static if (GDC_with_SSE2) 1239 { 1240 return __builtin_ia32_cvtpd2dq(a); 1241 } 1242 else static if (LDC_with_ARM64) 1243 { 1244 // Get current rounding mode. 1245 uint fpscr = arm_get_fpcr(); 1246 long2 i; 1247 switch(fpscr & _MM_ROUND_MASK_ARM) 1248 { 1249 default: 1250 case _MM_ROUND_NEAREST_ARM: i = vcvtnq_s64_f64(a); break; 1251 case _MM_ROUND_DOWN_ARM: i = vcvtmq_s64_f64(a); break; 1252 case _MM_ROUND_UP_ARM: i = vcvtpq_s64_f64(a); break; 1253 case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break; 1254 } 1255 int4 zero = 0; 1256 return cast(__m128i) shufflevector!(int4, 0, 2, 4, 6)(cast(int4)i, zero); 1257 } 1258 else 1259 { 1260 // PERF ARM32 1261 __m128i r = _mm_setzero_si128(); 1262 r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]); 1263 r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]); 1264 return r; 1265 } 1266 } 1267 unittest 1268 { 1269 int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0)); 1270 assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0); 1271 } 1272 1273 /// Convert packed double-precision (64-bit) floating-point elements in `v` 1274 /// to packed 32-bit integers 1275 __m64 _mm_cvtpd_pi32 (__m128d v) @safe 1276 { 1277 return to_m64(_mm_cvtpd_epi32(v)); 1278 } 1279 unittest 1280 { 1281 int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0)); 1282 assert(A.array[0] == 55 && A.array[1] == 61); 1283 } 1284 1285 /// Convert packed double-precision (64-bit) floating-point elements 1286 /// in `a` to packed single-precision (32-bit) floating-point elements. 1287 __m128 _mm_cvtpd_ps (__m128d a) pure @trusted 1288 { 1289 static if (LDC_with_SSE2) 1290 { 1291 return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately 1292 } 1293 else static if (GDC_with_SSE2) 1294 { 1295 return __builtin_ia32_cvtpd2ps(a); 1296 } 1297 else 1298 { 1299 __m128 r = void; 1300 r.ptr[0] = a.array[0]; 1301 r.ptr[1] = a.array[1]; 1302 r.ptr[2] = 0; 1303 r.ptr[3] = 0; 1304 return r; 1305 } 1306 } 1307 unittest 1308 { 1309 __m128d A = _mm_set_pd(5.25, 4.0); 1310 __m128 B = _mm_cvtpd_ps(A); 1311 assert(B.array == [4.0f, 5.25f, 0, 0]); 1312 } 1313 1314 /// Convert packed 32-bit integers in `v` to packed double-precision 1315 /// (64-bit) floating-point elements. 1316 __m128d _mm_cvtpi32_pd (__m64 v) pure @safe 1317 { 1318 return _mm_cvtepi32_pd(to_m128i(v)); 1319 } 1320 unittest 1321 { 1322 __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5)); 1323 assert(A.array[0] == 4.0 && A.array[1] == -5.0); 1324 } 1325 1326 /// Convert packed single-precision (32-bit) floating-point elements 1327 /// in `a` to packed 32-bit integers 1328 __m128i _mm_cvtps_epi32 (__m128 a) @trusted 1329 { 1330 static if (LDC_with_SSE2) 1331 { 1332 return cast(__m128i) __builtin_ia32_cvtps2dq(a); 1333 } 1334 else static if (GDC_with_SSE2) 1335 { 1336 return __builtin_ia32_cvtps2dq(a); 1337 } 1338 else static if (LDC_with_ARM64) 1339 { 1340 // Get current rounding mode. 1341 uint fpscr = arm_get_fpcr(); 1342 switch(fpscr & _MM_ROUND_MASK_ARM) 1343 { 1344 default: 1345 case _MM_ROUND_NEAREST_ARM: return vcvtnq_s32_f32(a); 1346 case _MM_ROUND_DOWN_ARM: return vcvtmq_s32_f32(a); 1347 case _MM_ROUND_UP_ARM: return vcvtpq_s32_f32(a); 1348 case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a); 1349 } 1350 } 1351 else 1352 { 1353 __m128i r = void; 1354 r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]); 1355 r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]); 1356 r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]); 1357 r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]); 1358 return r; 1359 } 1360 } 1361 unittest 1362 { 1363 // GDC bug #98607 1364 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607 1365 // GDC does not provide optimization barrier for rounding mode. 1366 // Workarounded with different literals. This bug will likely only manifest in unittest. 1367 // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't. 1368 1369 uint savedRounding = _MM_GET_ROUNDING_MODE(); 1370 1371 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 1372 __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f)); 1373 assert(A.array == [1, -2, 54, -3]); 1374 1375 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 1376 A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f)); 1377 assert(A.array == [1, -3, 53, -3]); 1378 1379 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 1380 A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f)); 1381 assert(A.array == [2, -2, 54, -2]); 1382 1383 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 1384 A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f)); 1385 assert(A.array == [1, -2, 53, -2]); 1386 1387 _MM_SET_ROUNDING_MODE(savedRounding); 1388 } 1389 1390 /// Convert packed single-precision (32-bit) floating-point elements 1391 /// in `a` to packed double-precision (64-bit) floating-point elements. 1392 __m128d _mm_cvtps_pd (__m128 a) pure @trusted 1393 { 1394 version(LDC) 1395 { 1396 // Generates cvtps2pd since LDC 1.0 -O0 1397 enum ir = ` 1398 %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1> 1399 %r = fpext <2 x float> %v to <2 x double> 1400 ret <2 x double> %r`; 1401 return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a); 1402 } 1403 else static if (GDC_with_SSE2) 1404 { 1405 return __builtin_ia32_cvtps2pd(a); 1406 } 1407 else 1408 { 1409 double2 r = void; 1410 r.ptr[0] = a.array[0]; 1411 r.ptr[1] = a.array[1]; 1412 return r; 1413 } 1414 } 1415 unittest 1416 { 1417 __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f)); 1418 assert(A.array[0] == 54.0); 1419 assert(A.array[1] == 54.0); 1420 } 1421 1422 /// Copy the lower double-precision (64-bit) floating-point element of `a`. 1423 double _mm_cvtsd_f64 (__m128d a) pure @safe 1424 { 1425 return a.array[0]; 1426 } 1427 1428 /// Convert the lower double-precision (64-bit) floating-point element 1429 /// in `a` to a 32-bit integer. 1430 int _mm_cvtsd_si32 (__m128d a) @safe 1431 { 1432 static if (LDC_with_SSE2) 1433 { 1434 return __builtin_ia32_cvtsd2si(a); 1435 } 1436 else static if (GDC_with_SSE2) 1437 { 1438 return __builtin_ia32_cvtsd2si(a); 1439 } 1440 else 1441 { 1442 return convertDoubleToInt32UsingMXCSR(a[0]); 1443 } 1444 } 1445 unittest 1446 { 1447 assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0))); 1448 } 1449 1450 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer. 1451 long _mm_cvtsd_si64 (__m128d a) @trusted 1452 { 1453 version (LDC) 1454 { 1455 version (X86_64) 1456 { 1457 return __builtin_ia32_cvtsd2si64(a); 1458 } 1459 else 1460 { 1461 // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer 1462 // using SSE instructions only. So the builtin doesn't exit for this arch. 1463 return convertDoubleToInt64UsingMXCSR(a[0]); 1464 } 1465 } 1466 else 1467 { 1468 return convertDoubleToInt64UsingMXCSR(a.array[0]); 1469 } 1470 } 1471 unittest 1472 { 1473 assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0))); 1474 1475 uint savedRounding = _MM_GET_ROUNDING_MODE(); 1476 1477 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); 1478 assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49))); 1479 1480 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); 1481 assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1))); 1482 1483 _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); 1484 assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1))); 1485 1486 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); 1487 assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9))); 1488 1489 _MM_SET_ROUNDING_MODE(savedRounding); 1490 } 1491 1492 deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; /// 1493 1494 /// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit) 1495 /// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a` 1496 /// to the upper elements of result. 1497 __m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted 1498 { 1499 static if (GDC_with_SSE2) 1500 { 1501 return __builtin_ia32_cvtsd2ss(a, b); 1502 } 1503 else 1504 { 1505 // Generates cvtsd2ss since LDC 1.3 -O0 1506 a.ptr[0] = b.array[0]; 1507 return a; 1508 } 1509 } 1510 unittest 1511 { 1512 __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0)); 1513 assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]); 1514 } 1515 1516 /// Get the lower 32-bit integer in `a`. 1517 int _mm_cvtsi128_si32 (__m128i a) pure @safe 1518 { 1519 return a.array[0]; 1520 } 1521 1522 /// Get the lower 64-bit integer in `a`. 1523 long _mm_cvtsi128_si64 (__m128i a) pure @safe 1524 { 1525 long2 la = cast(long2)a; 1526 return la.array[0]; 1527 } 1528 deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64; 1529 1530 /// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the 1531 /// lower element of result, and copy the upper element from `a` to the upper element of result. 1532 __m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted 1533 { 1534 a.ptr[0] = cast(double)b; 1535 return a; 1536 } 1537 unittest 1538 { 1539 __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42); 1540 assert(a.array == [42.0, 0]); 1541 } 1542 1543 /// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements. 1544 __m128i _mm_cvtsi32_si128 (int a) pure @trusted 1545 { 1546 int4 r = [0, 0, 0, 0]; 1547 r.ptr[0] = a; 1548 return r; 1549 } 1550 unittest 1551 { 1552 __m128i a = _mm_cvtsi32_si128(65); 1553 assert(a.array == [65, 0, 0, 0]); 1554 } 1555 1556 /// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in 1557 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 1558 1559 __m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted 1560 { 1561 a.ptr[0] = cast(double)b; 1562 return a; 1563 } 1564 unittest 1565 { 1566 __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42); 1567 assert(a.array == [42.0, 0]); 1568 } 1569 1570 /// Copy 64-bit integer `a` to the lower element of result, and zero the upper element. 1571 __m128i _mm_cvtsi64_si128 (long a) pure @trusted 1572 { 1573 long2 r = [0, 0]; 1574 r.ptr[0] = a; 1575 return cast(__m128i)(r); 1576 } 1577 1578 deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; /// 1579 deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; /// 1580 1581 /// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit) 1582 /// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper 1583 // element of result. 1584 double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted 1585 { 1586 a.ptr[0] = b.array[0]; 1587 return a; 1588 } 1589 unittest 1590 { 1591 __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f)); 1592 assert(a.array == [42.0, 0]); 1593 } 1594 1595 /// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation. 1596 long _mm_cvttss_si64 (__m128 a) pure @safe 1597 { 1598 return cast(long)(a.array[0]); // Generates cvttss2si as expected 1599 } 1600 unittest 1601 { 1602 assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f))); 1603 } 1604 1605 /// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation. 1606 /// Put zeroes in the upper elements of result. 1607 __m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted 1608 { 1609 static if (LDC_with_SSE2) 1610 { 1611 return __builtin_ia32_cvttpd2dq(a); 1612 } 1613 else static if (GDC_with_SSE2) 1614 { 1615 return __builtin_ia32_cvttpd2dq(a); 1616 } 1617 else 1618 { 1619 // Note: doesn't generate cvttpd2dq as of LDC 1.13 1620 __m128i r; 1621 r.ptr[0] = cast(int)a.array[0]; 1622 r.ptr[1] = cast(int)a.array[1]; 1623 r.ptr[2] = 0; 1624 r.ptr[3] = 0; 1625 return r; 1626 } 1627 } 1628 unittest 1629 { 1630 __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f)); 1631 assert(R.array == [-4, 45641, 0, 0]); 1632 } 1633 1634 /// Convert packed double-precision (64-bit) floating-point elements in `v` 1635 /// to packed 32-bit integers with truncation. 1636 __m64 _mm_cvttpd_pi32 (__m128d v) pure @safe 1637 { 1638 return to_m64(_mm_cvttpd_epi32(v)); 1639 } 1640 unittest 1641 { 1642 int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f)); 1643 int[2] correct = [-4, 45641]; 1644 assert(R.array == correct); 1645 } 1646 1647 /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation. 1648 __m128i _mm_cvttps_epi32 (__m128 a) pure @trusted 1649 { 1650 // x86: Generates cvttps2dq since LDC 1.3 -O2 1651 // ARM64: generates fcvtze since LDC 1.8 -O2 1652 __m128i r; 1653 r.ptr[0] = cast(int)a.array[0]; 1654 r.ptr[1] = cast(int)a.array[1]; 1655 r.ptr[2] = cast(int)a.array[2]; 1656 r.ptr[3] = cast(int)a.array[3]; 1657 return r; 1658 } 1659 unittest 1660 { 1661 __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f)); 1662 assert(R.array == [-4, 45641, 0, 1]); 1663 } 1664 1665 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation. 1666 int _mm_cvttsd_si32 (__m128d a) 1667 { 1668 // Generates cvttsd2si since LDC 1.3 -O0 1669 return cast(int)a.array[0]; 1670 } 1671 1672 /// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation. 1673 long _mm_cvttsd_si64 (__m128d a) 1674 { 1675 // Generates cvttsd2si since LDC 1.3 -O0 1676 // but in 32-bit instead, it's a long sequence that resort to FPU 1677 return cast(long)a.array[0]; 1678 } 1679 1680 deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; /// 1681 1682 /// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`. 1683 __m128d _mm_div_pd(__m128d a, __m128d b) pure @safe 1684 { 1685 return a / b; 1686 } 1687 1688 __m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted 1689 { 1690 static if (GDC_with_SSE2) 1691 { 1692 return __builtin_ia32_divsd(a, b); 1693 } 1694 else version(DigitalMars) 1695 { 1696 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 1697 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 1698 asm pure nothrow @nogc @trusted { nop;} 1699 a.array[0] = a.array[0] / b.array[0]; 1700 return a; 1701 } 1702 else 1703 { 1704 a.ptr[0] /= b.array[0]; 1705 return a; 1706 } 1707 } 1708 unittest 1709 { 1710 __m128d a = [2.0, 4.5]; 1711 a = _mm_div_sd(a, a); 1712 assert(a.array == [1.0, 4.5]); 1713 } 1714 1715 /// Extract a 16-bit integer from `v`, selected with `index`. 1716 /// Warning: the returned value is zero-extended to 32-bits. 1717 int _mm_extract_epi16(__m128i v, int index) pure @safe 1718 { 1719 short8 r = cast(short8)v; 1720 return cast(ushort)(r.array[index & 7]); 1721 } 1722 unittest 1723 { 1724 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1); 1725 assert(_mm_extract_epi16(A, 6) == 6); 1726 assert(_mm_extract_epi16(A, 0) == 65535); 1727 assert(_mm_extract_epi16(A, 5 + 8) == 5); 1728 } 1729 1730 /// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`. 1731 __m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted 1732 { 1733 short8 r = cast(short8)v; 1734 r.ptr[index & 7] = cast(short)i; 1735 return cast(__m128i)r; 1736 } 1737 unittest 1738 { 1739 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 1740 short8 R = cast(short8) _mm_insert_epi16(A, 42, 6); 1741 short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7]; 1742 assert(R.array == correct); 1743 } 1744 1745 1746 void _mm_lfence() @trusted 1747 { 1748 version(GNU) 1749 { 1750 1751 static if (GDC_with_SSE2) 1752 { 1753 __builtin_ia32_lfence(); 1754 } 1755 else version(X86) 1756 { 1757 asm pure nothrow @nogc @trusted 1758 { 1759 "lfence;\n" : : : ; 1760 } 1761 } 1762 else 1763 static assert(false); 1764 } 1765 else static if (LDC_with_SSE2) 1766 { 1767 __builtin_ia32_lfence(); 1768 } 1769 else static if (DMD_with_asm) 1770 { 1771 asm nothrow @nogc pure @safe 1772 { 1773 lfence; 1774 } 1775 } 1776 else version(LDC) 1777 { 1778 llvm_memory_fence(); // PERF actually generates mfence 1779 } 1780 else 1781 static assert(false); 1782 } 1783 unittest 1784 { 1785 _mm_lfence(); 1786 } 1787 1788 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 1789 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1790 __m128d _mm_load_pd (const(double) * mem_addr) pure 1791 { 1792 __m128d* aligned = cast(__m128d*)mem_addr; 1793 return *aligned; 1794 } 1795 unittest 1796 { 1797 align(16) double[2] S = [-5.0, 7.0]; 1798 __m128d R = _mm_load_pd(S.ptr); 1799 assert(R.array == S); 1800 } 1801 1802 /// Load a double-precision (64-bit) floating-point element from memory into both elements of dst. 1803 /// `mem_addr` does not need to be aligned on any particular boundary. 1804 __m128d _mm_load_pd1 (const(double)* mem_addr) pure 1805 { 1806 double m = *mem_addr; 1807 __m128d r; 1808 r.ptr[0] = m; 1809 r.ptr[1] = m; 1810 return r; 1811 } 1812 unittest 1813 { 1814 double what = 4; 1815 __m128d R = _mm_load_pd1(&what); 1816 double[2] correct = [4.0, 4]; 1817 assert(R.array == correct); 1818 } 1819 1820 /// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper 1821 /// element. `mem_addr` does not need to be aligned on any particular boundary. 1822 __m128d _mm_load_sd (const(double)* mem_addr) pure @trusted 1823 { 1824 double2 r = [0, 0]; 1825 r.ptr[0] = *mem_addr; 1826 return r; 1827 } 1828 unittest 1829 { 1830 double x = -42; 1831 __m128d a = _mm_load_sd(&x); 1832 assert(a.array == [-42.0, 0.0]); 1833 } 1834 1835 /// Load 128-bits of integer data from memory into dst. 1836 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1837 __m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shoudln't be trusted because alignment, Issue #62 1838 { 1839 return *mem_addr; 1840 } 1841 unittest 1842 { 1843 align(16) int[4] correct = [-1, 2, 3, 4]; 1844 int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr); 1845 assert(A.array == correct); 1846 } 1847 1848 alias _mm_load1_pd = _mm_load_pd1; /// 1849 1850 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 1851 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary. 1852 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted 1853 { 1854 a.ptr[1] = *mem_addr; 1855 return a; 1856 } 1857 unittest 1858 { 1859 double A = 7.0; 1860 __m128d B = _mm_setr_pd(4.0, -5.0); 1861 __m128d R = _mm_loadh_pd(B, &A); 1862 double[2] correct = [ 4.0, 7.0 ]; 1863 assert(R.array == correct); 1864 } 1865 1866 /// Load 64-bit integer from memory into the first element of result. Zero out the other. 1867 // Note: strange signature since the memory doesn't have to aligned (Issue #60) 1868 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature 1869 { 1870 auto pLong = cast(const(long)*)mem_addr; 1871 long2 r = [0, 0]; 1872 r.ptr[0] = *pLong; 1873 return cast(__m128i)(r); 1874 } 1875 unittest 1876 { 1877 long A = 0x7878787870707070; 1878 long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A); 1879 long[2] correct = [0x7878787870707070, 0]; 1880 assert(R.array == correct); 1881 } 1882 1883 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 1884 /// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary. 1885 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted 1886 { 1887 a.ptr[0] = *mem_addr; 1888 return a; 1889 } 1890 unittest 1891 { 1892 double A = 7.0; 1893 __m128d B = _mm_setr_pd(4.0, -5.0); 1894 __m128d R = _mm_loadl_pd(B, &A); 1895 double[2] correct = [ 7.0, -5.0 ]; 1896 assert(R.array == correct); 1897 } 1898 1899 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 1900 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1901 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted 1902 { 1903 __m128d a = *cast(__m128d*)(mem_addr); 1904 __m128d r; 1905 r.ptr[0] = a.array[1]; 1906 r.ptr[1] = a.array[0]; 1907 return r; 1908 } 1909 unittest 1910 { 1911 align(16) double[2] A = [56.0, -74.0]; 1912 __m128d R = _mm_loadr_pd(A.ptr); 1913 double[2] correct = [-74.0, 56.0]; 1914 assert(R.array == correct); 1915 } 1916 1917 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 1918 /// `mem_addr` does not need to be aligned on any particular boundary. 1919 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted 1920 { 1921 static if (GDC_with_SSE2) 1922 { 1923 return __builtin_ia32_loadupd(mem_addr); 1924 } 1925 else version(LDC) 1926 { 1927 return loadUnaligned!(double2)(mem_addr); 1928 } 1929 else version(DigitalMars) 1930 { 1931 static if (DMD_with_DSIMD) 1932 { 1933 return cast(__m128d)__simd(XMM.LODUPD, *mem_addr); 1934 } 1935 else static if (SSESizedVectorsAreEmulated) 1936 { 1937 // Since this vector is emulated, it doesn't have alignement constraints 1938 // and as such we can just cast it. 1939 return *cast(__m128d*)(mem_addr); 1940 } 1941 else 1942 { 1943 __m128d result; 1944 result.ptr[0] = mem_addr[0]; 1945 result.ptr[1] = mem_addr[1]; 1946 return result; 1947 } 1948 } 1949 else 1950 { 1951 __m128d result; 1952 result.ptr[0] = mem_addr[0]; 1953 result.ptr[1] = mem_addr[1]; 1954 return result; 1955 } 1956 } 1957 unittest 1958 { 1959 double[2] A = [56.0, -75.0]; 1960 __m128d R = _mm_loadu_pd(A.ptr); 1961 double[2] correct = [56.0, -75.0]; 1962 assert(R.array == correct); 1963 } 1964 1965 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary. 1966 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted 1967 { 1968 static if (GDC_with_SSE2) 1969 { 1970 return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr); 1971 } 1972 else 1973 { 1974 return loadUnaligned!(__m128i)(cast(int*)mem_addr); 1975 } 1976 } 1977 unittest 1978 { 1979 align(16) int[4] correct = [-1, 2, -3, 4]; 1980 int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr); 1981 assert(A.array == correct); 1982 } 1983 1984 /// Load unaligned 32-bit integer from memory into the first element of result. 1985 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted 1986 { 1987 int r = *cast(int*)(mem_addr); 1988 int4 result = [0, 0, 0, 0]; 1989 result.ptr[0] = r; 1990 return result; 1991 } 1992 unittest 1993 { 1994 int r = 42; 1995 __m128i A = _mm_loadu_si32(&r); 1996 int[4] correct = [42, 0, 0, 0]; 1997 assert(A.array == correct); 1998 } 1999 2000 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 2001 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 2002 /// and pack the results in destination. 2003 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted 2004 { 2005 static if (GDC_with_SSE2) 2006 { 2007 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2008 } 2009 else static if (LDC_with_SSE2) 2010 { 2011 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2012 } 2013 else static if (LDC_with_ARM64) 2014 { 2015 int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b)); 2016 int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b)); 2017 int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); 2018 int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); 2019 return vcombine_s32(rl, rh); 2020 } 2021 else 2022 { 2023 short8 sa = cast(short8)a; 2024 short8 sb = cast(short8)b; 2025 int4 r; 2026 foreach(i; 0..4) 2027 { 2028 r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1]; 2029 } 2030 return r; 2031 } 2032 } 2033 unittest 2034 { 2035 short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2036 short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2037 int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B); 2038 int[4] correct = [1, 13, -2147483648, 2*32767*32767]; 2039 assert(R.array == correct); 2040 } 2041 2042 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 2043 /// (elements are not stored when the highest bit is not set in the corresponding element) 2044 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 2045 /// boundary. 2046 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted 2047 { 2048 static if (GDC_with_SSE2) 2049 { 2050 return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr); 2051 } 2052 else static if (LDC_with_SSE2) 2053 { 2054 return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr); 2055 } 2056 else static if (LDC_with_ARM64) 2057 { 2058 // PERF: catastrophic on ARM32 2059 byte16 bmask = cast(byte16)mask; 2060 byte16 shift = 7; 2061 bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask 2062 mask = cast(__m128i) bmask; 2063 __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr); 2064 dest = (a & mask) | (dest & ~mask); 2065 storeUnaligned!__m128i(dest, cast(int*)mem_addr); 2066 } 2067 else 2068 { 2069 byte16 b = cast(byte16)a; 2070 byte16 m = cast(byte16)mask; 2071 byte* dest = cast(byte*)(mem_addr); 2072 foreach(j; 0..16) 2073 { 2074 if (m.array[j] & 128) 2075 { 2076 dest[j] = b.array[j]; 2077 } 2078 } 2079 } 2080 } 2081 unittest 2082 { 2083 ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]; 2084 __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0); 2085 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15); 2086 _mm_maskmoveu_si128(A, mask, dest.ptr); 2087 ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42]; 2088 assert(dest == correct); 2089 } 2090 2091 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values. 2092 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe 2093 { 2094 static if (GDC_with_SSE2) 2095 { 2096 return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b); 2097 } 2098 else version(LDC) 2099 { 2100 // x86: pmaxsw since LDC 1.0 -O1 2101 // ARM: smax.8h since LDC 1.5 -01 2102 short8 sa = cast(short8)a; 2103 short8 sb = cast(short8)b; 2104 short8 greater = greaterMask!short8(sa, sb); 2105 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2106 } 2107 else 2108 { 2109 __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else 2110 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2111 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2112 return _mm_xor_si128(b, mask); 2113 } 2114 } 2115 unittest 2116 { 2117 short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57), 2118 _mm_setr_epi16(-4,-8, 9, 7, 0,-32768, 0, 0)); 2119 short[8] correct = [32767, 1, 9, 7, 9, 7, 0, 0]; 2120 assert(R.array == correct); 2121 } 2122 2123 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values. 2124 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe 2125 { 2126 version(LDC) 2127 { 2128 // x86: pmaxub since LDC 1.0.0 -O1 2129 // ARM64: umax.16b since LDC 1.5.0 -O1 2130 // PERF: catastrophic on ARM32 2131 ubyte16 sa = cast(ubyte16)a; 2132 ubyte16 sb = cast(ubyte16)b; 2133 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2134 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2135 } 2136 else 2137 { 2138 __m128i value128 = _mm_set1_epi8(-128); 2139 __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2140 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2141 __m128i mask = _mm_and_si128(aTob, higher); 2142 return _mm_xor_si128(b, mask); 2143 } 2144 } 2145 unittest 2146 { 2147 byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2148 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2149 byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 2150 assert(R.array == correct); 2151 } 2152 2153 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values. 2154 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted 2155 { 2156 static if (GDC_with_SSE2) 2157 { 2158 return __builtin_ia32_maxpd(a, b); 2159 } 2160 else 2161 { 2162 // x86: Generates maxpd starting with LDC 1.9 -O2 2163 a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; 2164 a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1]; 2165 return a; 2166 } 2167 } 2168 unittest 2169 { 2170 __m128d A = _mm_setr_pd(4.0, 1.0); 2171 __m128d B = _mm_setr_pd(1.0, 8.0); 2172 __m128d M = _mm_max_pd(A, B); 2173 assert(M.array[0] == 4.0); 2174 assert(M.array[1] == 8.0); 2175 } 2176 2177 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 2178 /// lower element of result, and copy the upper element from `a` to the upper element of result. 2179 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted 2180 { 2181 static if (GDC_with_SSE2) 2182 { 2183 return __builtin_ia32_maxsd(a, b); 2184 } 2185 else 2186 { 2187 __m128d r = a; 2188 // Generates maxsd starting with LDC 1.3 2189 r.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; 2190 return r; 2191 } 2192 } 2193 unittest 2194 { 2195 __m128d A = _mm_setr_pd(1.0, 1.0); 2196 __m128d B = _mm_setr_pd(4.0, 2.0); 2197 __m128d M = _mm_max_sd(A, B); 2198 assert(M.array[0] == 4.0); 2199 assert(M.array[1] == 1.0); 2200 } 2201 2202 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 2203 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 2204 /// is globally visible before any memory instruction which follows the fence in program order. 2205 void _mm_mfence() @trusted 2206 { 2207 version(GNU) 2208 { 2209 static if (GDC_with_SSE2) 2210 { 2211 __builtin_ia32_mfence(); 2212 } 2213 else version(X86) 2214 { 2215 asm pure nothrow @nogc @trusted 2216 { 2217 "mfence;\n" : : : ; 2218 } 2219 } 2220 else 2221 static assert(false); 2222 } 2223 else static if (LDC_with_SSE2) 2224 { 2225 __builtin_ia32_mfence(); 2226 } 2227 else static if (DMD_with_asm) 2228 { 2229 asm nothrow @nogc pure @safe 2230 { 2231 mfence; 2232 } 2233 } 2234 else version(LDC) 2235 { 2236 void _mm_mfence() pure @safe 2237 { 2238 // Note: will generate the DMB instruction on ARM 2239 llvm_memory_fence(); 2240 } 2241 } 2242 else 2243 static assert(false); 2244 } 2245 unittest 2246 { 2247 _mm_mfence(); 2248 } 2249 2250 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values. 2251 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe 2252 { 2253 static if (GDC_with_SSE2) 2254 { 2255 return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b); 2256 } 2257 else version(LDC) 2258 { 2259 // x86: pminsw since LDC 1.0 -O1 2260 // ARM64: smin.8h since LDC 1.5 -01 2261 short8 sa = cast(short8)a; 2262 short8 sb = cast(short8)b; 2263 short8 greater = greaterMask!short8(sa, sb); 2264 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2265 } 2266 else 2267 { 2268 __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else 2269 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2270 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2271 return _mm_xor_si128(b, mask); 2272 } 2273 } 2274 unittest 2275 { 2276 short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-32768), 2277 _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0)); 2278 short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -32768]; 2279 assert(R.array == correct); 2280 } 2281 2282 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values. 2283 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe 2284 { 2285 version(LDC) 2286 { 2287 // x86: pminub since LDC 1.0.0 -O1 2288 // ARM: umin.16b since LDC 1.5.0 -O1 2289 // PERF: catastrophic on ARM32 2290 ubyte16 sa = cast(ubyte16)a; 2291 ubyte16 sb = cast(ubyte16)b; 2292 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2293 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2294 } 2295 else 2296 { 2297 __m128i value128 = _mm_set1_epi8(-128); 2298 __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2299 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2300 __m128i mask = _mm_and_si128(aTob, lower); 2301 return _mm_xor_si128(b, mask); 2302 } 2303 } 2304 unittest 2305 { 2306 byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2307 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2308 byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; 2309 assert(R.array == correct); 2310 } 2311 2312 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values. 2313 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted 2314 { 2315 static if (GDC_with_SSE2) 2316 { 2317 return __builtin_ia32_minpd(a, b); 2318 } 2319 else 2320 { 2321 // Generates minpd starting with LDC 1.9 2322 a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2323 a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1]; 2324 return a; 2325 } 2326 } 2327 unittest 2328 { 2329 __m128d A = _mm_setr_pd(1.0, 2.0); 2330 __m128d B = _mm_setr_pd(4.0, 1.0); 2331 __m128d M = _mm_min_pd(A, B); 2332 assert(M.array[0] == 1.0); 2333 assert(M.array[1] == 1.0); 2334 } 2335 2336 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 2337 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 2338 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe 2339 { 2340 static if (GDC_with_SSE2) 2341 { 2342 return __builtin_ia32_minsd(a, b); 2343 } 2344 else 2345 { 2346 // Generates minsd starting with LDC 1.3 2347 __m128d r = a; 2348 r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2349 return r; 2350 } 2351 } 2352 unittest 2353 { 2354 __m128d A = _mm_setr_pd(1.0, 3.0); 2355 __m128d B = _mm_setr_pd(4.0, 2.0); 2356 __m128d M = _mm_min_sd(A, B); 2357 assert(M.array[0] == 1.0); 2358 assert(M.array[1] == 3.0); 2359 } 2360 2361 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element. 2362 __m128i _mm_move_epi64 (__m128i a) pure @trusted 2363 { 2364 static if (GDC_with_SSE2) 2365 { 2366 // slightly better with GDC -O0 2367 return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 2368 } 2369 else 2370 { 2371 long2 result = [ 0, 0 ]; 2372 long2 la = cast(long2) a; 2373 result.ptr[0] = la.array[0]; 2374 return cast(__m128i)(result); 2375 } 2376 } 2377 unittest 2378 { 2379 long2 A = [13, 47]; 2380 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); 2381 long[2] correct = [13, 0]; 2382 assert(B.array == correct); 2383 } 2384 2385 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 2386 /// the upper element from `a` to the upper element of dst. 2387 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted 2388 { 2389 static if (GDC_with_SSE2) 2390 { 2391 return __builtin_ia32_movsd(a, b); 2392 } 2393 else 2394 { 2395 b.ptr[1] = a.array[1]; 2396 return b; 2397 } 2398 } 2399 unittest 2400 { 2401 double2 A = [13.0, 47.0]; 2402 double2 B = [34.0, 58.0]; 2403 double2 C = _mm_move_sd(A, B); 2404 double[2] correct = [34.0, 47.0]; 2405 assert(C.array == correct); 2406 } 2407 2408 /// Create mask from the most significant bit of each 8-bit element in `v`. 2409 int _mm_movemask_epi8 (__m128i a) pure @trusted 2410 { 2411 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 2412 static if (GDC_with_SSE2) 2413 { 2414 return __builtin_ia32_pmovmskb128(cast(ubyte16)a); 2415 } 2416 else static if (LDC_with_SSE2) 2417 { 2418 return __builtin_ia32_pmovmskb128(cast(byte16)a); 2419 } 2420 else static if (LDC_with_ARM64) 2421 { 2422 // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon 2423 // The other two solutions lead to unfound intrinsics in LLVM and that took a long time. 2424 // SO there might be something a bit faster, but this one is reasonable and branchless. 2425 byte8 mask_shift; 2426 mask_shift.ptr[0] = 7; 2427 mask_shift.ptr[1] = 6; 2428 mask_shift.ptr[2] = 5; 2429 mask_shift.ptr[3] = 4; 2430 mask_shift.ptr[4] = 3; 2431 mask_shift.ptr[5] = 2; 2432 mask_shift.ptr[6] = 1; 2433 mask_shift.ptr[7] = 0; 2434 byte8 mask_and = byte8(-128); 2435 byte8 lo = vget_low_u8(cast(byte16)a); 2436 byte8 hi = vget_high_u8(cast(byte16)a); 2437 lo = vand_u8(lo, mask_and); 2438 lo = vshr_u8(lo, mask_shift); 2439 hi = vand_u8(hi, mask_and); 2440 hi = vshr_u8(hi, mask_shift); 2441 lo = vpadd_u8(lo,lo); 2442 lo = vpadd_u8(lo,lo); 2443 lo = vpadd_u8(lo,lo); 2444 hi = vpadd_u8(hi,hi); 2445 hi = vpadd_u8(hi,hi); 2446 hi = vpadd_u8(hi,hi); 2447 return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]); 2448 } 2449 else 2450 { 2451 byte16 ai = cast(byte16)a; 2452 int r = 0; 2453 foreach(bit; 0..16) 2454 { 2455 if (ai.array[bit] < 0) r += (1 << bit); 2456 } 2457 return r; 2458 } 2459 } 2460 unittest 2461 { 2462 assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0))); 2463 } 2464 2465 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 2466 /// loating-point element in `v`. 2467 int _mm_movemask_pd(__m128d v) pure @safe 2468 { 2469 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 2470 static if (GDC_with_SSE2) 2471 { 2472 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2473 /// packed double-precision (64-bit) floating-point element in `v`. 2474 return __builtin_ia32_movmskpd(v); 2475 } 2476 else static if (LDC_with_SSE2) 2477 { 2478 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2479 /// packed double-precision (64-bit) floating-point element in `v`. 2480 return __builtin_ia32_movmskpd(v); 2481 } 2482 else 2483 { 2484 long2 lv = cast(long2)v; 2485 int r = 0; 2486 if (lv.array[0] < 0) r += 1; 2487 if (lv.array[1] < 0) r += 2; 2488 return r; 2489 } 2490 } 2491 unittest 2492 { 2493 __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0); 2494 assert(_mm_movemask_pd(A) == 2); 2495 } 2496 2497 /// Copy the lower 64-bit integer in `v`. 2498 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe 2499 { 2500 long2 lv = cast(long2)v; 2501 return long1(lv.array[0]); 2502 } 2503 unittest 2504 { 2505 __m128i A = _mm_set_epi64x(-1, -2); 2506 __m64 R = _mm_movepi64_pi64(A); 2507 assert(R.array[0] == -2); 2508 } 2509 2510 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element. 2511 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted 2512 { 2513 long2 r; 2514 r.ptr[0] = a.array[0]; 2515 r.ptr[1] = 0; 2516 return cast(__m128i)r; 2517 } 2518 2519 // Note: generates pmuludq in LDC with -O1 2520 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted 2521 { 2522 __m128i zero = _mm_setzero_si128(); 2523 2524 static if (__VERSION__ >= 2088) 2525 { 2526 // Need LLVM9 to avoid this shufflevector 2527 long2 la, lb; 2528 la.ptr[0] = cast(uint)a.array[0]; 2529 la.ptr[1] = cast(uint)a.array[2]; 2530 lb.ptr[0] = cast(uint)b.array[0]; 2531 lb.ptr[1] = cast(uint)b.array[2]; 2532 } 2533 else 2534 { 2535 long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero); 2536 long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero); 2537 } 2538 2539 version(DigitalMars) 2540 { 2541 // DMD has no long2 mul 2542 // long2 mul not supported before LDC 1.5 2543 la.ptr[0] *= lb.array[0]; 2544 la.ptr[1] *= lb.array[1]; 2545 return cast(__m128i)(la); 2546 } 2547 else 2548 { 2549 static if (__VERSION__ >= 2076) 2550 { 2551 return cast(__m128i)(la * lb); 2552 } 2553 else 2554 { 2555 // long2 mul not supported before LDC 1.5 2556 la.ptr[0] *= lb.array[0]; 2557 la.ptr[1] *= lb.array[1]; 2558 return cast(__m128i)(la); 2559 } 2560 } 2561 } 2562 unittest 2563 { 2564 __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff); 2565 __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff); 2566 __m128i C = _mm_mul_epu32(A, B); 2567 long2 LC = cast(long2)C; 2568 assert(LC.array[0] == 18446744065119617025uL); 2569 assert(LC.array[1] == 12723420444339690338uL); 2570 } 2571 2572 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 2573 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe 2574 { 2575 return a * b; 2576 } 2577 unittest 2578 { 2579 __m128d a = [-2.0, 1.5]; 2580 a = _mm_mul_pd(a, a); 2581 assert(a.array == [4.0, 2.25]); 2582 } 2583 2584 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 2585 /// element of result, and copy the upper element from `a` to the upper element of result. 2586 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted 2587 { 2588 version(DigitalMars) 2589 { 2590 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 2591 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 2592 asm pure nothrow @nogc @trusted { nop;} 2593 a.array[0] = a.array[0] * b.array[0]; 2594 return a; 2595 } 2596 else static if (GDC_with_SSE2) 2597 { 2598 return __builtin_ia32_mulsd(a, b); 2599 } 2600 else 2601 { 2602 a.ptr[0] *= b.array[0]; 2603 return a; 2604 } 2605 } 2606 unittest 2607 { 2608 __m128d a = [-2.0, 1.5]; 2609 a = _mm_mul_sd(a, a); 2610 assert(a.array == [4.0, 1.5]); 2611 } 2612 2613 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 2614 /// and get an unsigned 64-bit result. 2615 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe 2616 { 2617 return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b))); 2618 } 2619 unittest 2620 { 2621 __m64 A = _mm_set_pi32(42, 0xDEADBEEF); 2622 __m64 B = _mm_set_pi32(42, 0xCAFEBABE); 2623 __m64 C = _mm_mul_su32(A, B); 2624 assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL); 2625 } 2626 2627 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2628 /// high 16 bits of the intermediate integers. 2629 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted 2630 { 2631 static if (GDC_with_SSE2) 2632 { 2633 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2634 } 2635 else static if (LDC_with_SSE2) 2636 { 2637 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2638 } 2639 else 2640 { 2641 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h 2642 // PERF: it seems the simde solution has one less instruction in ARM64. 2643 // PERF: Catastrophic in ARM32. 2644 short8 sa = cast(short8)a; 2645 short8 sb = cast(short8)b; 2646 short8 r = void; 2647 r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16; 2648 r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16; 2649 r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16; 2650 r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16; 2651 r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16; 2652 r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16; 2653 r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16; 2654 r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16; 2655 return cast(__m128i)r; 2656 } 2657 } 2658 unittest 2659 { 2660 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2661 __m128i B = _mm_set1_epi16(16384); 2662 short8 R = cast(short8)_mm_mulhi_epi16(A, B); 2663 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; 2664 assert(R.array == correct); 2665 } 2666 2667 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2668 /// high 16 bits of the intermediate integers. 2669 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted 2670 { 2671 static if (GDC_with_SSE2) 2672 { 2673 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2674 } 2675 else static if (LDC_with_SSE2) 2676 { 2677 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2678 } 2679 else 2680 { 2681 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h 2682 // it seems the simde solution has one less instruction in ARM64 2683 // PERF: Catastrophic in ARM32. 2684 short8 sa = cast(short8)a; 2685 short8 sb = cast(short8)b; 2686 short8 r = void; 2687 r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 ); 2688 r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 ); 2689 r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 ); 2690 r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 ); 2691 r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 ); 2692 r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 ); 2693 r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 ); 2694 r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 ); 2695 return cast(__m128i)r; 2696 } 2697 } 2698 unittest 2699 { 2700 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2701 __m128i B = _mm_set1_epi16(16384); 2702 short8 R = cast(short8)_mm_mulhi_epu16(A, B); 2703 short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; 2704 assert(R.array == correct); 2705 } 2706 2707 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 2708 /// bits of the intermediate integers. 2709 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe 2710 { 2711 return cast(__m128i)(cast(short8)a * cast(short8)b); 2712 } 2713 unittest 2714 { 2715 __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7); 2716 __m128i B = _mm_set1_epi16(16384); 2717 short8 R = cast(short8)_mm_mullo_epi16(A, B); 2718 short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384]; 2719 assert(R.array == correct); 2720 } 2721 2722 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 2723 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe 2724 { 2725 return cast(__m128d)( cast(__m128i)a | cast(__m128i)b ); 2726 } 2727 2728 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`. 2729 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe 2730 { 2731 return a | b; 2732 } 2733 2734 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation. 2735 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted 2736 { 2737 static if (GDC_with_SSE2) 2738 { 2739 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2740 } 2741 else static if (LDC_with_SSE2) 2742 { 2743 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2744 } 2745 else static if (LDC_with_ARM64) 2746 { 2747 short4 ra = vqmovn_s32(cast(int4)a); 2748 short4 rb = vqmovn_s32(cast(int4)b); 2749 return cast(__m128i)vcombine_s16(ra, rb); 2750 } 2751 else 2752 { 2753 // PERF: catastrophic on ARM32 2754 short8 r; 2755 r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]); 2756 r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]); 2757 r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]); 2758 r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]); 2759 r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]); 2760 r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]); 2761 r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]); 2762 r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]); 2763 return cast(__m128i)r; 2764 } 2765 } 2766 unittest 2767 { 2768 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 2769 short8 R = cast(short8) _mm_packs_epi32(A, A); 2770 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; 2771 assert(R.array == correct); 2772 } 2773 2774 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation. 2775 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted 2776 { 2777 static if (GDC_with_SSE2) 2778 { 2779 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2780 } 2781 else static if (LDC_with_SSE2) 2782 { 2783 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2784 } 2785 else static if (LDC_with_ARM64) 2786 { 2787 // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02 2788 byte8 ra = vqmovn_s16(cast(short8)a); 2789 byte8 rb = vqmovn_s16(cast(short8)b); 2790 return cast(__m128i)vcombine_s8(ra, rb); 2791 } 2792 else 2793 { 2794 // PERF: ARM32 is missing 2795 byte16 r; 2796 short8 sa = cast(short8)a; 2797 short8 sb = cast(short8)b; 2798 foreach(i; 0..8) 2799 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]); 2800 foreach(i; 0..8) 2801 r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]); 2802 return cast(__m128i)r; 2803 } 2804 } 2805 unittest 2806 { 2807 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); 2808 byte16 R = cast(byte16) _mm_packs_epi16(A, A); 2809 byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0, 2810 127, -128, 127, 0, 127, -128, 127, 0]; 2811 assert(R.array == correct); 2812 } 2813 2814 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation. 2815 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2816 { 2817 static if (GDC_with_SSE2) 2818 { 2819 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2820 } 2821 else static if (LDC_with_SSE2) 2822 { 2823 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2824 } 2825 else static if (LDC_with_ARM64) 2826 { 2827 // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02 2828 byte8 ra = vqmovun_s16(cast(short8)a); 2829 byte8 rb = vqmovun_s16(cast(short8)b); 2830 return cast(__m128i)vcombine_s8(ra, rb); 2831 } 2832 else 2833 { 2834 short8 sa = cast(short8)a; 2835 short8 sb = cast(short8)b; 2836 ubyte[16] result = void; 2837 for (int i = 0; i < 8; ++i) 2838 { 2839 short s = sa[i]; 2840 if (s < 0) s = 0; 2841 if (s > 255) s = 255; 2842 result[i] = cast(ubyte)s; 2843 2844 s = sb[i]; 2845 if (s < 0) s = 0; 2846 if (s > 255) s = 255; 2847 result[i+8] = cast(ubyte)s; 2848 } 2849 return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr); 2850 } 2851 } 2852 unittest 2853 { 2854 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 2855 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 2856 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 2857 0, 255, 0, 255, 255, 2, 1, 0]; 2858 foreach(i; 0..16) 2859 assert(AA.array[i] == cast(byte)(correctResult[i])); 2860 } 2861 2862 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 2863 /// and power consumption of spin-wait loops. 2864 void _mm_pause() @trusted 2865 { 2866 version(GNU) 2867 { 2868 static if (GDC_with_SSE2) 2869 { 2870 __builtin_ia32_pause(); 2871 } 2872 else version(X86) 2873 { 2874 asm pure nothrow @nogc @trusted 2875 { 2876 "pause;\n" : : : ; 2877 } 2878 } 2879 else 2880 static assert(false); 2881 } 2882 else static if (LDC_with_SSE2) 2883 { 2884 __builtin_ia32_pause(); 2885 } 2886 else static if (DMD_with_asm) 2887 { 2888 asm nothrow @nogc pure @safe 2889 { 2890 rep; nop; // F3 90 = pause 2891 } 2892 } 2893 else version (LDC) 2894 { 2895 // PERF: Do nothing currently , could be the "yield" intruction on ARM. 2896 } 2897 else 2898 static assert(false); 2899 } 2900 unittest 2901 { 2902 _mm_pause(); 2903 } 2904 2905 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 2906 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 2907 /// low 16 bits of 64-bit elements in result. 2908 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted 2909 { 2910 static if (GDC_with_SSE2) 2911 { 2912 return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b); 2913 } 2914 else static if (LDC_with_SSE2) 2915 { 2916 return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b); 2917 } 2918 else static if (LDC_with_ARM64) 2919 { 2920 ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b)); 2921 2922 // PERF: Looks suboptimal vs addp 2923 ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]); 2924 ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]); 2925 ushort8 r = 0; 2926 r[0] = r0; 2927 r[4] = r4; 2928 return cast(__m128i) r; 2929 } 2930 else 2931 { 2932 // PERF: ARM32 is lacking 2933 byte16 ab = cast(byte16)a; 2934 byte16 bb = cast(byte16)b; 2935 ubyte[16] t; 2936 foreach(i; 0..16) 2937 { 2938 int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]); 2939 if (diff < 0) diff = -diff; 2940 t[i] = cast(ubyte)(diff); 2941 } 2942 int4 r = _mm_setzero_si128(); 2943 r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 2944 r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; 2945 return r; 2946 } 2947 } 2948 unittest 2949 { 2950 __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 2951 __m128i B = _mm_set1_epi8(1); 2952 __m128i R = _mm_sad_epu8(A, B); 2953 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 2954 0, 2955 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 2956 0]; 2957 assert(R.array == correct); 2958 } 2959 2960 /// Set packed 16-bit integers with the supplied values. 2961 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 2962 { 2963 short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7]; 2964 return cast(__m128i) loadUnaligned!(short8)(result.ptr); 2965 } 2966 unittest 2967 { 2968 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 2969 short8 B = cast(short8) A; 2970 foreach(i; 0..8) 2971 assert(B.array[i] == i); 2972 } 2973 2974 /// Set packed 32-bit integers with the supplied values. 2975 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 2976 { 2977 int[4] result = [e0, e1, e2, e3]; 2978 return loadUnaligned!(int4)(result.ptr); 2979 } 2980 unittest 2981 { 2982 __m128i A = _mm_set_epi32(3, 2, 1, 0); 2983 foreach(i; 0..4) 2984 assert(A.array[i] == i); 2985 } 2986 2987 /// Set packed 64-bit integers with the supplied values. 2988 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted 2989 { 2990 long[2] result = [e0.array[0], e1.array[0]]; 2991 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 2992 } 2993 unittest 2994 { 2995 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); 2996 long2 B = cast(long2) A; 2997 assert(B.array[0] == 5678); 2998 assert(B.array[1] == 1234); 2999 } 3000 3001 /// Set packed 64-bit integers with the supplied values. 3002 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted 3003 { 3004 long[2] result = [e0, e1]; 3005 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 3006 } 3007 unittest 3008 { 3009 __m128i A = _mm_set_epi64x(1234, 5678); 3010 long2 B = cast(long2) A; 3011 assert(B.array[0] == 5678); 3012 assert(B.array[1] == 1234); 3013 } 3014 3015 /// Set packed 8-bit integers with the supplied values. 3016 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12, 3017 byte e11, byte e10, byte e9, byte e8, 3018 byte e7, byte e6, byte e5, byte e4, 3019 byte e3, byte e2, byte e1, byte e0) pure @trusted 3020 { 3021 byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7, 3022 e8, e9, e10, e11, e12, e13, e14, e15]; 3023 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 3024 } 3025 3026 /// Set packed double-precision (64-bit) floating-point elements with the supplied values. 3027 __m128d _mm_set_pd (double e1, double e0) pure @trusted 3028 { 3029 double[2] result = [e0, e1]; 3030 return loadUnaligned!(double2)(result.ptr); 3031 } 3032 unittest 3033 { 3034 __m128d A = _mm_set_pd(61.0, 55.0); 3035 double[2] correct = [55.0, 61.0]; 3036 assert(A.array == correct); 3037 } 3038 3039 /// Broadcast double-precision (64-bit) floating-point value `a` to all element. 3040 __m128d _mm_set_pd1 (double a) pure @trusted 3041 { 3042 double[2] result = [a, a]; 3043 return loadUnaligned!(double2)(result.ptr); 3044 } 3045 unittest 3046 { 3047 __m128d A = _mm_set_pd1(61.0); 3048 double[2] correct = [61.0, 61.0]; 3049 assert(A.array == correct); 3050 } 3051 3052 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 3053 /// and zero the upper element. 3054 __m128d _mm_set_sd (double a) pure @trusted 3055 { 3056 double[2] result = [a, 0]; 3057 return loadUnaligned!(double2)(result.ptr); 3058 } 3059 3060 /// Broadcast 16-bit integer a to all elements of dst. 3061 __m128i _mm_set1_epi16 (short a) pure @trusted 3062 { 3063 version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 3064 { 3065 short8 v = a; 3066 return cast(__m128i) v; 3067 } 3068 else 3069 return cast(__m128i)(short8(a)); 3070 } 3071 unittest 3072 { 3073 short8 a = cast(short8) _mm_set1_epi16(31); 3074 for (int i = 0; i < 8; ++i) 3075 assert(a.array[i] == 31); 3076 } 3077 3078 /// Broadcast 32-bit integer `a` to all elements. 3079 __m128i _mm_set1_epi32 (int a) pure @trusted 3080 { 3081 return cast(__m128i)(int4(a)); 3082 } 3083 unittest 3084 { 3085 int4 a = cast(int4) _mm_set1_epi32(31); 3086 for (int i = 0; i < 4; ++i) 3087 assert(a.array[i] == 31); 3088 } 3089 3090 /// Broadcast 64-bit integer `a` to all elements. 3091 __m128i _mm_set1_epi64 (__m64 a) pure @safe 3092 { 3093 return _mm_set_epi64(a, a); 3094 } 3095 unittest 3096 { 3097 long b = 0x1DEADCAFE; 3098 __m64 a; 3099 a.ptr[0] = b; 3100 long2 c = cast(long2) _mm_set1_epi64(a); 3101 assert(c.array[0] == b); 3102 assert(c.array[1] == b); 3103 } 3104 3105 /// Broadcast 64-bit integer `a` to all elements 3106 __m128i _mm_set1_epi64x (long a) pure @trusted 3107 { 3108 long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3109 return cast(__m128i)(b); 3110 } 3111 unittest 3112 { 3113 long b = 0x1DEADCAFE; 3114 long2 c = cast(long2) _mm_set1_epi64x(b); 3115 for (int i = 0; i < 2; ++i) 3116 assert(c.array[i] == b); 3117 } 3118 3119 /// Broadcast 8-bit integer `a` to all elements. 3120 __m128i _mm_set1_epi8 (byte a) pure @trusted 3121 { 3122 byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3123 return cast(__m128i)(b); 3124 } 3125 unittest 3126 { 3127 byte16 b = cast(byte16) _mm_set1_epi8(31); 3128 for (int i = 0; i < 16; ++i) 3129 assert(b.array[i] == 31); 3130 } 3131 3132 alias _mm_set1_pd = _mm_set_pd1; 3133 3134 /// Set packed 16-bit integers with the supplied values in reverse order. 3135 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 3136 short e3, short e2, short e1, short e0) pure @trusted 3137 { 3138 short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0]; 3139 return cast(__m128i)( loadUnaligned!(short8)(result.ptr) ); 3140 } 3141 unittest 3142 { 3143 short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0); 3144 short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0]; 3145 assert(A.array == correct); 3146 } 3147 3148 /// Set packed 32-bit integers with the supplied values in reverse order. 3149 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3150 { 3151 int[4] result = [e3, e2, e1, e0]; 3152 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 3153 } 3154 unittest 3155 { 3156 int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647); 3157 int[4] correct = [-1, 0, -2147483648, 2147483647]; 3158 assert(A.array == correct); 3159 } 3160 3161 /// Set packed 64-bit integers with the supplied values in reverse order. 3162 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted 3163 { 3164 long[2] result = [e1, e0]; 3165 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 3166 } 3167 unittest 3168 { 3169 long2 A = cast(long2) _mm_setr_epi64(-1, 0); 3170 long[2] correct = [-1, 0]; 3171 assert(A.array == correct); 3172 } 3173 3174 /// Set packed 8-bit integers with the supplied values in reverse order. 3175 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12, 3176 byte e11, byte e10, byte e9, byte e8, 3177 byte e7, byte e6, byte e5, byte e4, 3178 byte e3, byte e2, byte e1, byte e0) pure @trusted 3179 { 3180 byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, 3181 e7, e6, e5, e4, e3, e2, e1, e0]; 3182 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 3183 } 3184 3185 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order. 3186 __m128d _mm_setr_pd (double e1, double e0) pure @trusted 3187 { 3188 double2 result; 3189 result.ptr[0] = e1; 3190 result.ptr[1] = e0; 3191 return result; 3192 } 3193 unittest 3194 { 3195 __m128d A = _mm_setr_pd(61.0, 55.0); 3196 double[2] correct = [61.0, 55.0]; 3197 assert(A.array == correct); 3198 } 3199 3200 /// Return vector of type `__m128d` with all elements set to zero. 3201 __m128d _mm_setzero_pd () pure @trusted 3202 { 3203 // Note: using loadUnaligned has better -O0 codegen compared to .ptr 3204 double[2] result = [0.0, 0.0]; 3205 return loadUnaligned!(double2)(result.ptr); 3206 } 3207 3208 /// Return vector of type `__m128i` with all elements set to zero. 3209 __m128i _mm_setzero_si128() pure @trusted 3210 { 3211 // Note: using loadUnaligned has better -O0 codegen compared to .ptr 3212 int[4] result = [0, 0, 0, 0]; 3213 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 3214 } 3215 3216 /// Shuffle 32-bit integers in a using the control in `imm8`. 3217 /// See_also: `_MM_SHUFFLE`. 3218 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe 3219 { 3220 static if (GDC_with_SSE2) 3221 { 3222 return __builtin_ia32_pshufd(a, imm8); 3223 } 3224 else 3225 { 3226 return shufflevector!(int4, (imm8 >> 0) & 3, 3227 (imm8 >> 2) & 3, 3228 (imm8 >> 4) & 3, 3229 (imm8 >> 6) & 3)(a, a); 3230 } 3231 } 3232 unittest 3233 { 3234 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 3235 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3236 int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A); 3237 int[4] expectedB = [ 3, 2, 1, 0 ]; 3238 assert(B.array == expectedB); 3239 } 3240 3241 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`. 3242 /// See_also: `_MM_SHUFFLE2`. 3243 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe 3244 { 3245 static if (GDC_with_SSE2) 3246 { 3247 return __builtin_ia32_shufpd(a, b, imm8); 3248 } 3249 else 3250 { 3251 return shufflevector!(double2, 0 + ( imm8 & 1 ), 3252 2 + ( (imm8 >> 1) & 1 ))(a, b); 3253 } 3254 } 3255 unittest 3256 { 3257 __m128d A = _mm_setr_pd(0.5, 2.0); 3258 __m128d B = _mm_setr_pd(4.0, 5.0); 3259 enum int SHUFFLE = _MM_SHUFFLE2(1, 1); 3260 __m128d R = _mm_shuffle_pd!SHUFFLE(A, B); 3261 double[2] correct = [ 2.0, 5.0 ]; 3262 assert(R.array == correct); 3263 } 3264 3265 /// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 3266 /// 64 bits of result, with the low 64 bits being copied from from `a` to result. 3267 /// See also: `_MM_SHUFFLE`. 3268 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe 3269 { 3270 static if (GDC_with_SSE2) 3271 { 3272 return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8); 3273 } 3274 else 3275 { 3276 return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3, 3277 4 + ( (imm8 >> 0) & 3 ), 3278 4 + ( (imm8 >> 2) & 3 ), 3279 4 + ( (imm8 >> 4) & 3 ), 3280 4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a); 3281 } 3282 } 3283 unittest 3284 { 3285 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3286 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3287 short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A); 3288 short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ]; 3289 assert(C.array == expectedC); 3290 } 3291 3292 /// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 3293 /// bits of result, with the high 64 bits being copied from from `a` to result. 3294 /// See_also: `_MM_SHUFFLE`. 3295 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe 3296 { 3297 static if (GDC_with_SSE2) 3298 { 3299 return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8); 3300 } 3301 else 3302 { 3303 return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ), 3304 ( (imm8 >> 2) & 3 ), 3305 ( (imm8 >> 4) & 3 ), 3306 ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a); 3307 } 3308 } 3309 unittest 3310 { 3311 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3312 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3313 short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A); 3314 short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ]; 3315 assert(B.array == expectedB); 3316 } 3317 3318 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros. 3319 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted 3320 { 3321 static if (LDC_with_SSE2) 3322 { 3323 return __builtin_ia32_pslld128(a, count); 3324 } 3325 else static if (GDC_with_SSE2) 3326 { 3327 return __builtin_ia32_pslld128(a, count); 3328 } 3329 else static if (DMD_with_32bit_asm) 3330 { 3331 asm pure nothrow @nogc @trusted 3332 { 3333 movdqu XMM0, a; 3334 movdqu XMM1, count; 3335 pslld XMM0, XMM1; 3336 movdqu a, XMM0; 3337 } 3338 return a; 3339 } 3340 else 3341 { 3342 int4 r = void; 3343 long2 lc = cast(long2)count; 3344 int bits = cast(int)(lc.array[0]); 3345 foreach(i; 0..4) 3346 r[i] = cast(uint)(a[i]) << bits; 3347 return r; 3348 } 3349 } 3350 3351 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros. 3352 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted 3353 { 3354 static if (LDC_with_SSE2) 3355 { 3356 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); 3357 } 3358 else static if (GDC_with_SSE2) 3359 { 3360 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); 3361 } 3362 else static if (DMD_with_32bit_asm) 3363 { 3364 asm pure nothrow @nogc @trusted 3365 { 3366 movdqu XMM0, a; 3367 movdqu XMM1, count; 3368 psllq XMM0, XMM1; 3369 movdqu a, XMM0; 3370 } 3371 return a; 3372 } 3373 else 3374 { 3375 // ARM: good since LDC 1.12 -O2 3376 // ~but -O0 version is catastrophic 3377 long2 r = void; 3378 long2 sa = cast(long2)a; 3379 long2 lc = cast(long2)count; 3380 int bits = cast(int)(lc.array[0]); 3381 foreach(i; 0..2) 3382 r.array[i] = cast(ulong)(sa.array[i]) << bits; 3383 return cast(__m128i)r; 3384 } 3385 } 3386 3387 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros. 3388 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted 3389 { 3390 static if (LDC_with_SSE2) 3391 { 3392 return cast(__m128i) _mm_sll_epi16(cast(short8)a, count); 3393 } 3394 else static if (GDC_with_SSE2) 3395 { 3396 return cast(__m128i) _mm_sll_epi16(cast(short8)a, count); 3397 } 3398 else static if (DMD_with_32bit_asm) 3399 { 3400 asm pure nothrow @nogc 3401 { 3402 movdqu XMM0, a; 3403 movdqu XMM1, count; 3404 psllw XMM0, XMM1; 3405 movdqu a, XMM0; 3406 } 3407 return a; 3408 } 3409 else 3410 { 3411 short8 sa = cast(short8)a; 3412 long2 lc = cast(long2)count; 3413 int bits = cast(int)(lc.array[0]); 3414 short8 r = void; 3415 foreach(i; 0..8) 3416 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits); 3417 return cast(int4)r; 3418 } 3419 } 3420 3421 3422 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. 3423 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted 3424 { 3425 static if (GDC_with_SSE2) 3426 { 3427 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); 3428 } 3429 else static if (LDC_with_SSE2) 3430 { 3431 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); 3432 } 3433 else 3434 { 3435 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3436 // D says "It's illegal to shift by the same or more bits 3437 // than the size of the quantity being shifted" 3438 // and it's UB instead. 3439 int4 r = _mm_setzero_si128(); 3440 3441 ubyte count = cast(ubyte) imm8; 3442 if (count > 31) 3443 return r; 3444 3445 foreach(i; 0..4) 3446 r.array[i] = cast(uint)(a.array[i]) << count; 3447 return r; 3448 } 3449 } 3450 unittest 3451 { 3452 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3453 __m128i B = _mm_slli_epi32(A, 1); 3454 __m128i B2 = _mm_slli_epi32(A, 1 + 256); 3455 int[4] expectedB = [ 0, 4, 6, -8]; 3456 assert(B.array == expectedB); 3457 assert(B2.array == expectedB); 3458 3459 __m128i C = _mm_slli_epi32(A, 0); 3460 int[4] expectedC = [ 0, 2, 3, -4]; 3461 assert(C.array == expectedC); 3462 3463 __m128i D = _mm_slli_epi32(A, 65); 3464 int[4] expectedD = [ 0, 0, 0, 0]; 3465 assert(D.array == expectedD); 3466 } 3467 3468 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 3469 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted 3470 { 3471 static if (GDC_with_SSE2) 3472 { 3473 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3474 } 3475 else static if (LDC_with_SSE2) 3476 { 3477 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3478 } 3479 else 3480 { 3481 long2 sa = cast(long2)a; 3482 3483 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3484 // D says "It's illegal to shift by the same or more bits 3485 // than the size of the quantity being shifted" 3486 // and it's UB instead. 3487 long2 r = cast(long2) _mm_setzero_si128(); 3488 ubyte count = cast(ubyte) imm8; 3489 if (count > 63) 3490 return cast(__m128i)r; 3491 3492 r.ptr[0] = cast(ulong)(sa.array[0]) << count; 3493 r.ptr[1] = cast(ulong)(sa.array[1]) << count; 3494 return cast(__m128i)r; 3495 } 3496 } 3497 unittest 3498 { 3499 __m128i A = _mm_setr_epi64(8, -4); 3500 long2 B = cast(long2) _mm_slli_epi64(A, 1); 3501 long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024); 3502 long[2] expectedB = [ 16, -8]; 3503 assert(B.array == expectedB); 3504 assert(B2.array == expectedB); 3505 3506 long2 C = cast(long2) _mm_slli_epi64(A, 0); 3507 long[2] expectedC = [ 8, -4]; 3508 assert(C.array == expectedC); 3509 3510 long2 D = cast(long2) _mm_slli_epi64(A, 64); 3511 long[2] expectedD = [ 0, -0]; 3512 assert(D.array == expectedD); 3513 } 3514 3515 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 3516 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted 3517 { 3518 static if (GDC_with_SSE2) 3519 { 3520 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3521 } 3522 else static if (LDC_with_SSE2) 3523 { 3524 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3525 } 3526 else static if (LDC_with_ARM64) 3527 { 3528 short8 sa = cast(short8)a; 3529 short8 r = cast(short8)_mm_setzero_si128(); 3530 ubyte count = cast(ubyte) imm8; 3531 if (count > 15) 3532 return cast(__m128i)r; 3533 r = sa << short8(count); 3534 return cast(__m128i)r; 3535 } 3536 else 3537 { 3538 short8 sa = cast(short8)a; 3539 short8 r = cast(short8)_mm_setzero_si128(); 3540 ubyte count = cast(ubyte) imm8; 3541 if (count > 15) 3542 return cast(__m128i)r; 3543 foreach(i; 0..8) 3544 r.ptr[i] = cast(short)(sa.array[i] << count); 3545 return cast(__m128i)r; 3546 } 3547 } 3548 unittest 3549 { 3550 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3551 short8 B = cast(short8)( _mm_slli_epi16(A, 1) ); 3552 short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) ); 3553 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 3554 assert(B.array == expectedB); 3555 assert(B2.array == expectedB); 3556 3557 short8 C = cast(short8)( _mm_slli_epi16(A, 16) ); 3558 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 3559 assert(C.array == expectedC); 3560 } 3561 3562 3563 /// Shift `a` left by `bytes` bytes while shifting in zeros. 3564 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted 3565 { 3566 static if (bytes & 0xF0) 3567 { 3568 return _mm_setzero_si128(); 3569 } 3570 else 3571 { 3572 static if (GDC_with_SSE2) 3573 { 3574 return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 3575 } 3576 else version(DigitalMars) 3577 { 3578 version(D_InlineAsm_X86) 3579 { 3580 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64 3581 { 3582 movdqu XMM0, op; 3583 pslldq XMM0, bytes; 3584 movdqu op, XMM0; 3585 } 3586 return op; 3587 } 3588 else 3589 { 3590 byte16 A = cast(byte16)op; 3591 byte16 R; 3592 for (int n = 15; n >= bytes; --n) 3593 R.ptr[n] = A.array[n-bytes]; 3594 for (int n = bytes-1; n >= 0; --n) 3595 R.ptr[n] = 0; 3596 return cast(__m128i)R; 3597 } 3598 } 3599 else 3600 { 3601 return cast(__m128i) shufflevector!(byte16, 3602 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes, 3603 22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes, 3604 28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes) 3605 (cast(byte16)_mm_setzero_si128(), cast(byte16)op); 3606 } 3607 } 3608 } 3609 unittest 3610 { 3611 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3612 short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left 3613 short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ]; 3614 assert(R.array == correct); 3615 3616 __m128i B = _mm_srli_si128!16(_mm_set1_epi32(-1)); 3617 int[4] expectedB = [0, 0, 0, 0]; 3618 assert(B.array == expectedB); 3619 } 3620 3621 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`. 3622 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted 3623 { 3624 version(LDC) 3625 { 3626 // Disappeared with LDC 1.11 3627 static if (__VERSION__ < 2081) 3628 return __builtin_ia32_sqrtpd(vec); 3629 else 3630 { 3631 vec.array[0] = llvm_sqrt(vec.array[0]); 3632 vec.array[1] = llvm_sqrt(vec.array[1]); 3633 return vec; 3634 } 3635 } 3636 else static if (GDC_with_SSE2) 3637 { 3638 return __builtin_ia32_sqrtpd(vec); 3639 } 3640 else 3641 { 3642 vec.ptr[0] = sqrt(vec.array[0]); 3643 vec.ptr[1] = sqrt(vec.array[1]); 3644 return vec; 3645 } 3646 } 3647 3648 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 3649 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 3650 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted 3651 { 3652 // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only. 3653 // "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 3654 // The quadword at bits 127:64 of the destination operand remains unchanged." 3655 version(LDC) 3656 { 3657 // Disappeared with LDC 1.11 3658 static if (__VERSION__ < 2081) 3659 { 3660 __m128d c = __builtin_ia32_sqrtsd(b); 3661 a[0] = c[0]; 3662 return a; 3663 } 3664 else 3665 { 3666 a.array[0] = llvm_sqrt(b.array[0]); 3667 return a; 3668 } 3669 } 3670 else static if (GDC_with_SSE2) 3671 { 3672 __m128d c = __builtin_ia32_sqrtsd(b); 3673 a.ptr[0] = c.array[0]; 3674 return a; 3675 } 3676 else 3677 { 3678 a.ptr[0] = sqrt(b.array[0]); 3679 return a; 3680 } 3681 } 3682 unittest 3683 { 3684 __m128d A = _mm_setr_pd(1.0, 3.0); 3685 __m128d B = _mm_setr_pd(4.0, 5.0); 3686 __m128d R = _mm_sqrt_sd(A, B); 3687 double[2] correct = [2.0, 3.0 ]; 3688 assert(R.array == correct); 3689 } 3690 3691 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits. 3692 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted 3693 { 3694 static if (GDC_with_SSE2) 3695 { 3696 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3697 } 3698 else static if (LDC_with_SSE2) 3699 { 3700 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3701 } 3702 else 3703 { 3704 short8 sa = cast(short8)a; 3705 long2 lc = cast(long2)count; 3706 int bits = cast(int)(lc.array[0]); 3707 short8 r = void; 3708 foreach(i; 0..8) 3709 r.ptr[i] = cast(short)(sa.array[i] >> bits); 3710 return cast(int4)r; 3711 } 3712 } 3713 3714 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits. 3715 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted 3716 { 3717 static if (LDC_with_SSE2) 3718 { 3719 return __builtin_ia32_psrad128(a, count); 3720 } 3721 else static if (GDC_with_SSE2) 3722 { 3723 return __builtin_ia32_psrad128(a, count); 3724 } 3725 else 3726 { 3727 int4 r = void; 3728 long2 lc = cast(long2)count; 3729 int bits = cast(int)(lc.array[0]); 3730 r.ptr[0] = (a.array[0] >> bits); 3731 r.ptr[1] = (a.array[1] >> bits); 3732 r.ptr[2] = (a.array[2] >> bits); 3733 r.ptr[3] = (a.array[3] >> bits); 3734 return r; 3735 } 3736 } 3737 3738 3739 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 3740 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 3741 { 3742 static if (GDC_with_SSE2) 3743 { 3744 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3745 } 3746 else static if (LDC_with_SSE2) 3747 { 3748 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3749 } 3750 else static if (LDC_with_ARM64) 3751 { 3752 short8 sa = cast(short8)a; 3753 ubyte count = cast(ubyte)imm8; 3754 if (count > 15) 3755 count = 15; 3756 short8 r = sa >> short8(count); 3757 return cast(__m128i)r; 3758 } 3759 else 3760 { 3761 short8 sa = cast(short8)a; 3762 short8 r = void; 3763 3764 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3765 // D says "It's illegal to shift by the same or more bits 3766 // than the size of the quantity being shifted" 3767 // and it's UB instead. 3768 ubyte count = cast(ubyte)imm8; 3769 if (count > 15) 3770 count = 15; 3771 foreach(i; 0..8) 3772 r.ptr[i] = cast(short)(sa.array[i] >> count); 3773 return cast(int4)r; 3774 } 3775 } 3776 unittest 3777 { 3778 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3779 short8 B = cast(short8)( _mm_srai_epi16(A, 1) ); 3780 short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) ); 3781 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 3782 assert(B.array == expectedB); 3783 assert(B2.array == expectedB); 3784 3785 short8 C = cast(short8)( _mm_srai_epi16(A, 18) ); 3786 short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ]; 3787 assert(C.array == expectedC); 3788 } 3789 3790 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 3791 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted 3792 { 3793 static if (LDC_with_SSE2) 3794 { 3795 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 3796 } 3797 else static if (GDC_with_SSE2) 3798 { 3799 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 3800 } 3801 else 3802 { 3803 int4 r = void; 3804 3805 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3806 // D says "It's illegal to shift by the same or more bits 3807 // than the size of the quantity being shifted" 3808 // and it's UB instead. 3809 ubyte count = cast(ubyte) imm8; 3810 if (count > 31) 3811 count = 31; 3812 3813 r.ptr[0] = (a.array[0] >> count); 3814 r.ptr[1] = (a.array[1] >> count); 3815 r.ptr[2] = (a.array[2] >> count); 3816 r.ptr[3] = (a.array[3] >> count); 3817 return r; 3818 } 3819 } 3820 unittest 3821 { 3822 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3823 __m128i B = _mm_srai_epi32(A, 1); 3824 __m128i B2 = _mm_srai_epi32(A, 1 + 256); 3825 int[4] expectedB = [ 0, 1, 1, -2]; 3826 assert(B.array == expectedB); 3827 assert(B2.array == expectedB); 3828 3829 __m128i C = _mm_srai_epi32(A, 32); 3830 int[4] expectedC = [ 0, 0, 0, -1]; 3831 assert(C.array == expectedC); 3832 3833 __m128i D = _mm_srai_epi32(A, 0); 3834 int[4] expectedD = [ 0, 2, 3, -4]; 3835 assert(D.array == expectedD); 3836 } 3837 3838 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted 3839 { 3840 static if (LDC_with_SSE2) 3841 { 3842 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 3843 } 3844 else static if (GDC_with_SSE2) 3845 { 3846 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 3847 } 3848 else 3849 { 3850 short8 sa = cast(short8)a; 3851 long2 lc = cast(long2)count; 3852 int bits = cast(int)(lc.array[0]); 3853 short8 r = void; 3854 foreach(i; 0..8) 3855 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits); 3856 return cast(int4)r; 3857 } 3858 } 3859 3860 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted 3861 { 3862 static if (LDC_with_SSE2) 3863 { 3864 return __builtin_ia32_psrld128(a, count); 3865 } 3866 else static if (GDC_with_SSE2) 3867 { 3868 return __builtin_ia32_psrld128(a, count); 3869 } 3870 else 3871 { 3872 int4 r = void; 3873 long2 lc = cast(long2)count; 3874 int bits = cast(int)(lc.array[0]); 3875 r.ptr[0] = cast(uint)(a.array[0]) >> bits; 3876 r.ptr[1] = cast(uint)(a.array[1]) >> bits; 3877 r.ptr[2] = cast(uint)(a.array[2]) >> bits; 3878 r.ptr[3] = cast(uint)(a.array[3]) >> bits; 3879 return r; 3880 } 3881 } 3882 3883 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted 3884 { 3885 static if (LDC_with_SSE2) 3886 { 3887 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 3888 } 3889 else static if (GDC_with_SSE2) 3890 { 3891 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 3892 } 3893 else 3894 { 3895 long2 r = void; 3896 long2 sa = cast(long2)a; 3897 long2 lc = cast(long2)count; 3898 int bits = cast(int)(lc.array[0]); 3899 r.ptr[0] = cast(ulong)(sa.array[0]) >> bits; 3900 r.ptr[1] = cast(ulong)(sa.array[1]) >> bits; 3901 return cast(__m128i)r; 3902 } 3903 } 3904 3905 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 3906 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted 3907 { 3908 static if (GDC_with_SSE2) 3909 { 3910 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 3911 } 3912 else static if (LDC_with_SSE2) 3913 { 3914 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 3915 } 3916 else static if (LDC_with_ARM64) 3917 { 3918 short8 sa = cast(short8)a; 3919 short8 r = cast(short8) _mm_setzero_si128(); 3920 3921 ubyte count = cast(ubyte)imm8; 3922 if (count >= 16) 3923 return cast(__m128i)r; 3924 3925 r = sa >>> short8(count); // This facility offered with LDC, but not DMD. 3926 return cast(__m128i)r; 3927 } 3928 else 3929 { 3930 short8 sa = cast(short8)a; 3931 ubyte count = cast(ubyte)imm8; 3932 3933 short8 r = cast(short8) _mm_setzero_si128(); 3934 if (count >= 16) 3935 return cast(__m128i)r; 3936 3937 foreach(i; 0..8) 3938 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count); 3939 return cast(__m128i)r; 3940 } 3941 } 3942 unittest 3943 { 3944 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3945 short8 B = cast(short8)( _mm_srli_epi16(A, 1) ); 3946 short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) ); 3947 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 3948 assert(B.array == expectedB); 3949 assert(B2.array == expectedB); 3950 3951 short8 C = cast(short8)( _mm_srli_epi16(A, 16) ); 3952 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0]; 3953 assert(C.array == expectedC); 3954 3955 short8 D = cast(short8)( _mm_srli_epi16(A, 0) ); 3956 short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ]; 3957 assert(D.array == expectedD); 3958 } 3959 3960 3961 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 3962 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted 3963 { 3964 static if (GDC_with_SSE2) 3965 { 3966 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 3967 } 3968 else static if (LDC_with_SSE2) 3969 { 3970 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 3971 } 3972 else 3973 { 3974 ubyte count = cast(ubyte) imm8; 3975 3976 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3977 // D says "It's illegal to shift by the same or more bits 3978 // than the size of the quantity being shifted" 3979 // and it's UB instead. 3980 int4 r = _mm_setzero_si128(); 3981 if (count >= 32) 3982 return r; 3983 r.ptr[0] = a.array[0] >>> count; 3984 r.ptr[1] = a.array[1] >>> count; 3985 r.ptr[2] = a.array[2] >>> count; 3986 r.ptr[3] = a.array[3] >>> count; 3987 return r; 3988 } 3989 } 3990 unittest 3991 { 3992 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3993 __m128i B = _mm_srli_epi32(A, 1); 3994 __m128i B2 = _mm_srli_epi32(A, 1 + 256); 3995 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; 3996 assert(B.array == expectedB); 3997 assert(B2.array == expectedB); 3998 3999 __m128i C = _mm_srli_epi32(A, 255); 4000 int[4] expectedC = [ 0, 0, 0, 0 ]; 4001 assert(C.array == expectedC); 4002 } 4003 4004 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 4005 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted 4006 { 4007 static if (GDC_with_SSE2) 4008 { 4009 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4010 } 4011 else static if (LDC_with_SSE2) 4012 { 4013 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4014 } 4015 else 4016 { 4017 long2 r = cast(long2) _mm_setzero_si128(); 4018 long2 sa = cast(long2)a; 4019 4020 ubyte count = cast(ubyte) imm8; 4021 if (count >= 64) 4022 return cast(__m128i)r; 4023 4024 r.ptr[0] = sa.array[0] >>> count; 4025 r.ptr[1] = sa.array[1] >>> count; 4026 return cast(__m128i)r; 4027 } 4028 } 4029 unittest 4030 { 4031 __m128i A = _mm_setr_epi64(8, -4); 4032 long2 B = cast(long2) _mm_srli_epi64(A, 1); 4033 long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512); 4034 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; 4035 assert(B.array == expectedB); 4036 assert(B2.array == expectedB); 4037 4038 long2 C = cast(long2) _mm_srli_epi64(A, 64); 4039 long[2] expectedC = [ 0, 0 ]; 4040 assert(C.array == expectedC); 4041 } 4042 4043 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4044 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe 4045 { 4046 static if (bytes & 0xF0) 4047 { 4048 return _mm_setzero_si128(); 4049 } 4050 else static if (GDC_with_SSE2) 4051 { 4052 return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8)); 4053 } 4054 else static if (DMD_with_32bit_asm) 4055 { 4056 asm pure nothrow @nogc @trusted 4057 { 4058 movdqu XMM0, v; 4059 psrldq XMM0, bytes; 4060 movdqu v, XMM0; 4061 } 4062 return v; 4063 } 4064 else 4065 { 4066 return cast(__m128i) shufflevector!(byte16, 4067 bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7, 4068 bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15) 4069 (cast(byte16) v, cast(byte16)_mm_setzero_si128()); 4070 } 4071 } 4072 unittest 4073 { 4074 __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1)); 4075 int[4] correct = [2, 3, 4, 0]; 4076 assert(R.array == correct); 4077 4078 __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1)); 4079 int[4] expectedA = [0, 0, 0, 0]; 4080 assert(A.array == expectedA); 4081 } 4082 4083 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4084 /// #BONUS 4085 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe 4086 { 4087 return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v); 4088 } 4089 unittest 4090 { 4091 __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)); 4092 float[4] correct = [3.0f, 4.0f, 0, 0]; 4093 assert(R.array == correct); 4094 } 4095 4096 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4097 /// #BONUS 4098 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe 4099 { 4100 return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v); 4101 } 4102 4103 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 4104 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 4105 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted 4106 { 4107 __m128d* aligned = cast(__m128d*)mem_addr; 4108 *aligned = a; 4109 } 4110 4111 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 4112 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 4113 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted 4114 { 4115 __m128d* aligned = cast(__m128d*)mem_addr; 4116 __m128d r; 4117 r.ptr[0] = a.array[0]; 4118 r.ptr[1] = a.array[0]; 4119 *aligned = r; 4120 } 4121 4122 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 4123 /// be aligned on any particular boundary. 4124 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe 4125 { 4126 *mem_addr = a.array[0]; 4127 } 4128 4129 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 4130 /// general-protection exception may be generated. 4131 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe 4132 { 4133 *mem_addr = a; 4134 } 4135 4136 alias _mm_store1_pd = _mm_store_pd1; /// 4137 4138 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory. 4139 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe 4140 { 4141 *mem_addr = a.array[1]; 4142 } 4143 4144 // Note: `mem_addr` doesn't have to actually be aligned, which breaks 4145 // expectations from the user point of view. This problem also exist in C++. 4146 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe 4147 { 4148 long* dest = cast(long*)mem_addr; 4149 long2 la = cast(long2)a; 4150 *dest = la.array[0]; 4151 } 4152 unittest 4153 { 4154 long[3] A = [1, 2, 3]; 4155 _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000)); 4156 long[3] correct = [1, 0x1_0000_0000, 3]; 4157 assert(A == correct); 4158 } 4159 4160 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. 4161 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe 4162 { 4163 *mem_addr = a.array[0]; 4164 } 4165 4166 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be 4167 /// aligned on a 16-byte boundary or a general-protection exception may be generated. 4168 void _mm_storer_pd (double* mem_addr, __m128d a) pure 4169 { 4170 __m128d* aligned = cast(__m128d*)mem_addr; 4171 *aligned = shufflevector!(double2, 1, 0)(a, a); 4172 } 4173 4174 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 4175 /// `mem_addr` does not need to be aligned on any particular boundary. 4176 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe 4177 { 4178 storeUnaligned!double2(a, mem_addr); 4179 } 4180 4181 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 4182 /// boundary. 4183 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe 4184 { 4185 storeUnaligned!__m128i(a, cast(int*)mem_addr); 4186 } 4187 4188 /// Store 32-bit integer from the first element of `a` into memory. 4189 /// `mem_addr` does not need to be aligned on any particular boundary. 4190 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted 4191 { 4192 int* dest = cast(int*)mem_addr; 4193 *dest = a.array[0]; 4194 } 4195 unittest 4196 { 4197 int[2] arr = [-24, 12]; 4198 _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7)); 4199 assert(arr == [-24, -1]); 4200 } 4201 4202 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) 4203 /// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte 4204 /// boundary or a general-protection exception may be generated. 4205 void _mm_stream_pd (double* mem_addr, __m128d a) 4206 { 4207 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4208 __m128d* dest = cast(__m128d*)mem_addr; 4209 *dest = a; 4210 } 4211 4212 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint. 4213 /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception 4214 /// may be generated. 4215 void _mm_stream_si128 (__m128i* mem_addr, __m128i a) 4216 { 4217 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4218 __m128i* dest = cast(__m128i*)mem_addr; 4219 *dest = a; 4220 } 4221 4222 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache 4223 /// pollution. If the cache line containing address mem_addr is already in the cache, 4224 /// the cache will be updated. 4225 void _mm_stream_si32 (int* mem_addr, int a) 4226 { 4227 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4228 *mem_addr = a; 4229 } 4230 4231 /// Store 64-bit integer a into memory using a non-temporal hint to minimize 4232 /// cache pollution. If the cache line containing address mem_addr is already 4233 /// in the cache, the cache will be updated. 4234 void _mm_stream_si64 (long* mem_addr, long a) 4235 { 4236 // BUG See `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4237 *mem_addr = a; 4238 } 4239 4240 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. 4241 __m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe 4242 { 4243 return cast(__m128i)(cast(short8)a - cast(short8)b); 4244 } 4245 4246 /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. 4247 __m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe 4248 { 4249 return cast(__m128i)(cast(int4)a - cast(int4)b); 4250 } 4251 4252 /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. 4253 __m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe 4254 { 4255 return cast(__m128i)(cast(long2)a - cast(long2)b); 4256 } 4257 4258 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. 4259 __m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe 4260 { 4261 return cast(__m128i)(cast(byte16)a - cast(byte16)b); 4262 } 4263 4264 /// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit) 4265 /// floating-point elements in `a`. 4266 __m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe 4267 { 4268 return a - b; 4269 } 4270 4271 /// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) 4272 /// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the 4273 /// upper element of result. 4274 __m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted 4275 { 4276 version(DigitalMars) 4277 { 4278 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 4279 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 4280 asm pure nothrow @nogc @trusted { nop;} 4281 a[0] = a[0] - b[0]; 4282 return a; 4283 } 4284 else static if (GDC_with_SSE2) 4285 { 4286 return __builtin_ia32_subsd(a, b); 4287 } 4288 else 4289 { 4290 a.ptr[0] -= b.array[0]; 4291 return a; 4292 } 4293 } 4294 unittest 4295 { 4296 __m128d a = [1.5, -2.0]; 4297 a = _mm_sub_sd(a, a); 4298 assert(a.array == [0.0, -2.0]); 4299 } 4300 4301 /// Subtract 64-bit integer `b` from 64-bit integer `a`. 4302 __m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe 4303 { 4304 return a - b; 4305 } 4306 4307 /// Add packed 16-bit signed integers in `a` and `b` using signed saturation. 4308 __m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted 4309 { 4310 version(LDC) 4311 { 4312 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 4313 { 4314 // Generates PSUBSW since LDC 1.15 -O0 4315 /// Add packed 16-bit signed integers in `a` and `b` using signed saturation. 4316 4317 enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 4318 enum ir = ` 4319 %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 4320 ret <8 x i16> %r`; 4321 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 4322 } 4323 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 4324 { 4325 /// Add packed 16-bit signed integers in `a` and `b` using signed saturation. 4326 short[8] res; 4327 short8 sa = cast(short8)a; 4328 short8 sb = cast(short8)b; 4329 foreach(i; 0..8) 4330 res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]); 4331 return _mm_loadu_si128(cast(int4*)res.ptr); 4332 } 4333 else static if (LDC_with_SSE2) 4334 { 4335 return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b); 4336 } 4337 else 4338 static assert(false); 4339 } 4340 else static if (GDC_with_SSE2) 4341 { 4342 return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b); 4343 } 4344 else 4345 { 4346 short[8] res; 4347 short8 sa = cast(short8)a; 4348 short8 sb = cast(short8)b; 4349 foreach(i; 0..8) 4350 res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]); 4351 return _mm_loadu_si128(cast(int4*)res.ptr); 4352 } 4353 } 4354 unittest 4355 { 4356 short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0), 4357 _mm_setr_epi16(-10 , 16, 5, 4, 3, 2, 1, 0)); 4358 static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0]; 4359 assert(res.array == correctResult); 4360 } 4361 4362 /// Add packed 8-bit signed integers in `a` and `b` using signed saturation. 4363 __m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted 4364 { 4365 version(LDC) 4366 { 4367 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 4368 { 4369 // x86: Generates PSUBSB since LDC 1.15 -O0 4370 // ARM: Generates sqsub.16b since LDC 1.21 -O0 4371 enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 4372 enum ir = ` 4373 %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 4374 ret <16 x i8> %r`; 4375 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 4376 } 4377 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 4378 { 4379 byte[16] res; 4380 byte16 sa = cast(byte16)a; 4381 byte16 sb = cast(byte16)b; 4382 foreach(i; 0..16) 4383 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]); 4384 return _mm_loadu_si128(cast(int4*)res.ptr); 4385 } 4386 else static if (LDC_with_SSE2) 4387 { 4388 return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b); 4389 } 4390 else 4391 static assert(false); 4392 } 4393 else static if (GDC_with_SSE2) 4394 { 4395 return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b); 4396 } 4397 else 4398 { 4399 byte[16] res; 4400 byte16 sa = cast(byte16)a; 4401 byte16 sb = cast(byte16)b; 4402 foreach(i; 0..16) 4403 res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]); 4404 return _mm_loadu_si128(cast(int4*)res.ptr); 4405 } 4406 } 4407 unittest 4408 { 4409 byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 4410 _mm_setr_epi8( 15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 4411 static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 4412 assert(res.array == correctResult); 4413 } 4414 4415 /// Add packed 16-bit unsigned integers in `a` and `b` using unsigned saturation. 4416 __m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted 4417 { 4418 version(LDC) 4419 { 4420 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 4421 { 4422 // x86: Generates PSUBUSW since LDC 1.15 -O0 4423 // ARM: Generates uqsub.8h since LDC 1.21 -O0 4424 enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`; 4425 enum ir = ` 4426 %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1) 4427 ret <8 x i16> %r`; 4428 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b); 4429 } 4430 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 4431 { 4432 short[8] res; 4433 short8 sa = cast(short8)a; 4434 short8 sb = cast(short8)b; 4435 foreach(i; 0..8) 4436 { 4437 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]); 4438 res[i] = saturateSignedIntToUnsignedShort(sum); 4439 } 4440 return _mm_loadu_si128(cast(int4*)res.ptr); 4441 } 4442 else static if (LDC_with_SSE2) 4443 { 4444 return cast(__m128i) __builtin_ia32_psubusw128(a, b); 4445 } 4446 else 4447 static assert(false); 4448 } 4449 else static if (GDC_with_SSE2) 4450 { 4451 return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b); 4452 } 4453 else 4454 { 4455 short[8] res; 4456 short8 sa = cast(short8)a; 4457 short8 sb = cast(short8)b; 4458 foreach(i; 0..8) 4459 { 4460 int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]); 4461 res[i] = saturateSignedIntToUnsignedShort(sum); 4462 } 4463 return _mm_loadu_si128(cast(int4*)res.ptr); 4464 } 4465 } 4466 unittest 4467 { 4468 short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534, 1, 5, 4, 3, 2, 1, 0), 4469 _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0)); 4470 static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0]; 4471 assert(R.array == correct); 4472 } 4473 4474 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 4475 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted 4476 { 4477 version(LDC) 4478 { 4479 static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8 4480 { 4481 // x86: Generates PSUBUSB since LDC 1.15 -O0 4482 // ARM: Generates uqsub.16b since LDC 1.21 -O0 4483 enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`; 4484 enum ir = ` 4485 %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1) 4486 ret <16 x i8> %r`; 4487 return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b); 4488 } 4489 else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation 4490 { 4491 /// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation. 4492 __m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted 4493 { 4494 ubyte[16] res; 4495 byte16 sa = cast(byte16)a; 4496 byte16 sb = cast(byte16)b; 4497 foreach(i; 0..16) 4498 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i])); 4499 return _mm_loadu_si128(cast(int4*)res.ptr); 4500 } 4501 } 4502 else static if (LDC_with_SSE2) 4503 { 4504 return __builtin_ia32_psubusb128(a, b); 4505 } 4506 else 4507 static assert(false); 4508 } 4509 else static if (GDC_with_SSE2) 4510 { 4511 return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b); 4512 } 4513 else 4514 { 4515 ubyte[16] res; 4516 byte16 sa = cast(byte16)a; 4517 byte16 sb = cast(byte16)b; 4518 foreach(i; 0..16) 4519 res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i])); 4520 return _mm_loadu_si128(cast(int4*)res.ptr); 4521 } 4522 } 4523 unittest 4524 { 4525 byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 4526 _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); 4527 static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 4528 assert(res.array == correctResult); 4529 } 4530 4531 // Note: the only difference between these intrinsics is the signalling 4532 // behaviour of quiet NaNs. This is incorrect but the case where 4533 // you would want to differentiate between qNaN and sNaN and then 4534 // treat them differently on purpose seems extremely rare. 4535 alias _mm_ucomieq_sd = _mm_comieq_sd; /// 4536 alias _mm_ucomige_sd = _mm_comige_sd; /// 4537 alias _mm_ucomigt_sd = _mm_comigt_sd; /// 4538 alias _mm_ucomile_sd = _mm_comile_sd; /// 4539 alias _mm_ucomilt_sd = _mm_comilt_sd; /// 4540 alias _mm_ucomineq_sd = _mm_comineq_sd; /// 4541 4542 /// Return vector of type `__m128d` with undefined elements. 4543 __m128d _mm_undefined_pd() pure @safe 4544 { 4545 __m128d result = void; 4546 return result; 4547 } 4548 4549 /// Return vector of type `__m128i` with undefined elements. 4550 __m128i _mm_undefined_si128() pure @safe 4551 { 4552 __m128i result = void; 4553 return result; 4554 } 4555 4556 /// Unpack and interleave 16-bit integers from the high half of `a` and `b`. 4557 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe 4558 { 4559 static if (GDC_with_SSE2) 4560 { 4561 return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b); 4562 } 4563 else static if (DMD_with_32bit_asm) 4564 { 4565 asm pure nothrow @nogc @trusted 4566 { 4567 movdqu XMM0, a; 4568 movdqu XMM1, b; 4569 punpckhwd XMM0, XMM1; 4570 movdqu a, XMM0; 4571 } 4572 return a; 4573 } 4574 else 4575 { 4576 return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15) 4577 (cast(short8)a, cast(short8)b); 4578 } 4579 } 4580 unittest 4581 { 4582 __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11); 4583 __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19); 4584 short8 C = cast(short8)(_mm_unpackhi_epi16(A, B)); 4585 short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19]; 4586 assert(C.array == correct); 4587 } 4588 4589 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`. 4590 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted 4591 { 4592 static if (GDC_with_SSE2) 4593 { 4594 return __builtin_ia32_punpckhdq128(a, b); 4595 } 4596 else version(DigitalMars) 4597 { 4598 __m128i r; 4599 r.ptr[0] = a.array[2]; 4600 r.ptr[1] = b.array[2]; 4601 r.ptr[2] = a.array[3]; 4602 r.ptr[3] = b.array[3]; 4603 return r; 4604 } 4605 else 4606 { 4607 return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b); 4608 } 4609 } 4610 unittest 4611 { 4612 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4613 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 4614 __m128i C = _mm_unpackhi_epi32(A, B); 4615 int[4] correct = [3, 7, 4, 8]; 4616 assert(C.array == correct); 4617 } 4618 4619 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. 4620 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted 4621 { 4622 static if (GDC_with_SSE2) 4623 { 4624 return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b); 4625 } 4626 else 4627 { 4628 __m128i r = cast(__m128i)b; 4629 r[0] = a[2]; 4630 r[1] = a[3]; 4631 return r; 4632 } 4633 } 4634 unittest // Issue #36 4635 { 4636 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4637 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4638 long2 C = cast(long2)(_mm_unpackhi_epi64(A, B)); 4639 long[2] correct = [0x33333333_33333333, 0x55555555_55555555]; 4640 assert(C.array == correct); 4641 } 4642 4643 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. 4644 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe 4645 { 4646 static if (GDC_with_SSE2) 4647 { 4648 return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b); 4649 } 4650 else static if (DMD_with_32bit_asm) 4651 { 4652 asm pure nothrow @nogc @trusted 4653 { 4654 movdqu XMM0, a; 4655 movdqu XMM1, b; 4656 punpckhbw XMM0, XMM1; 4657 movdqu a, XMM0; 4658 } 4659 return a; 4660 } 4661 else 4662 { 4663 return cast(__m128i)shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27, 4664 12, 28, 13, 29, 14, 30, 15, 31) 4665 (cast(byte16)a, cast(byte16)b); 4666 } 4667 } 4668 unittest 4669 { 4670 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 4671 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 4672 byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B); 4673 byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]; 4674 assert(C.array == correct); 4675 } 4676 4677 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`. 4678 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe 4679 { 4680 static if (GDC_with_SSE2) 4681 { 4682 return __builtin_ia32_unpckhpd(a, b); 4683 } 4684 else 4685 { 4686 return shufflevector!(__m128d, 1, 3)(a, b); 4687 } 4688 } 4689 unittest 4690 { 4691 __m128d A = _mm_setr_pd(4.0, 6.0); 4692 __m128d B = _mm_setr_pd(7.0, 9.0); 4693 __m128d C = _mm_unpackhi_pd(A, B); 4694 double[2] correct = [6.0, 9.0]; 4695 assert(C.array == correct); 4696 } 4697 4698 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. 4699 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe 4700 { 4701 static if (GDC_with_SSE2) 4702 { 4703 return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b); 4704 } 4705 else static if (DMD_with_32bit_asm) 4706 { 4707 asm pure nothrow @nogc @trusted 4708 { 4709 movdqu XMM0, a; 4710 movdqu XMM1, b; 4711 punpcklwd XMM0, XMM1; 4712 movdqu a, XMM0; 4713 } 4714 return a; 4715 } 4716 else 4717 { 4718 return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11) 4719 (cast(short8)a, cast(short8)b); 4720 } 4721 } 4722 unittest 4723 { 4724 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 4725 __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); 4726 short8 C = cast(short8) _mm_unpacklo_epi16(A, B); 4727 short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11]; 4728 assert(C.array == correct); 4729 } 4730 4731 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. 4732 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted 4733 { 4734 static if (GDC_with_SSE2) 4735 { 4736 return __builtin_ia32_punpckldq128(a, b); 4737 } 4738 else version(DigitalMars) 4739 { 4740 __m128i r; 4741 r.ptr[0] = a.array[0]; 4742 r.ptr[1] = b.array[0]; 4743 r.ptr[2] = a.array[1]; 4744 r.ptr[3] = b.array[1]; 4745 return r; 4746 } 4747 else 4748 { 4749 return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b); 4750 } 4751 } 4752 unittest 4753 { 4754 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4755 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 4756 __m128i C = _mm_unpacklo_epi32(A, B); 4757 int[4] correct = [1, 5, 2, 6]; 4758 assert(C.array == correct); 4759 } 4760 4761 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. 4762 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted 4763 { 4764 static if (GDC_with_SSE2) 4765 { 4766 return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b); 4767 } 4768 else 4769 { 4770 long2 lA = cast(long2)a; 4771 long2 lB = cast(long2)b; 4772 long2 R; 4773 R.ptr[0] = lA.array[0]; 4774 R.ptr[1] = lB.array[0]; 4775 return cast(__m128i)R; 4776 } 4777 } 4778 unittest // Issue #36 4779 { 4780 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4781 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4782 long2 C = cast(long2)(_mm_unpacklo_epi64(A, B)); 4783 long[2] correct = [0x22222222_22222222, 0x44444444_44444444]; 4784 assert(C.array == correct); 4785 } 4786 4787 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. 4788 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe 4789 { 4790 static if (GDC_with_SSE2) 4791 { 4792 return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b); 4793 } 4794 else static if (DMD_with_32bit_asm) 4795 { 4796 asm pure nothrow @nogc @trusted 4797 { 4798 movdqu XMM0, a; 4799 movdqu XMM1, b; 4800 punpcklbw XMM0, XMM1; 4801 movdqu a, XMM0; 4802 } 4803 return a; 4804 } 4805 else 4806 { 4807 return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19, 4808 4, 20, 5, 21, 6, 22, 7, 23) 4809 (cast(byte16)a, cast(byte16)b); 4810 } 4811 } 4812 unittest 4813 { 4814 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 4815 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 4816 byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B); 4817 byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]; 4818 assert(C.array == correct); 4819 } 4820 4821 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`. 4822 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe 4823 { 4824 static if (GDC_with_SSE2) 4825 { 4826 return __builtin_ia32_unpcklpd(a, b); 4827 } 4828 else 4829 { 4830 return shufflevector!(__m128d, 0, 2)(a, b); 4831 } 4832 } 4833 unittest 4834 { 4835 __m128d A = _mm_setr_pd(4.0, 6.0); 4836 __m128d B = _mm_setr_pd(7.0, 9.0); 4837 __m128d C = _mm_unpacklo_pd(A, B); 4838 double[2] correct = [4.0, 7.0]; 4839 assert(C.array == correct); 4840 } 4841 4842 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 4843 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe 4844 { 4845 return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b); 4846 } 4847 4848 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`. 4849 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe 4850 { 4851 return a ^ b; 4852 } 4853 4854 unittest 4855 { 4856 float distance(float[4] a, float[4] b) nothrow @nogc 4857 { 4858 __m128 va = _mm_loadu_ps(a.ptr); 4859 __m128 vb = _mm_loadu_ps(b.ptr); 4860 __m128 diffSquared = _mm_sub_ps(va, vb); 4861 diffSquared = _mm_mul_ps(diffSquared, diffSquared); 4862 __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared)); 4863 sum = _mm_add_ps(sum, _mm_srli_ps!4(sum)); 4864 return _mm_cvtss_f32(_mm_sqrt_ss(sum)); 4865 } 4866 assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2); 4867 }