/**
* SSE2 intrinsics.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1
import inteli.mmx;
import inteli.internals;

nothrow @nogc:


// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a + cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 R = cast(short8) _mm_add_epi16(A, A);
    short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a + cast(int4)b);
}
unittest
{
    __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
    int4 R = _mm_add_epi32(A, A);
    int[4] correct = [ -14, -2, 0, 18 ];
    assert(R.array == correct);
}

/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a + cast(long2)b);
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 R = cast(long2) _mm_add_epi64(A, A);
    long[2] correct = [ -2, 0 ];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a + cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 R = cast(byte16) _mm_add_epi8(A, A);
    byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(R.array == correct);
}

/// Add the lower double-precision (64-bit) floating-point element
/// in `a` and `b`, store the result in the lower element of dst,
/// and copy the upper element from `a` to the upper element of destination.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}

/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    return a + b;
}
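
// Editor's note: an illustrative usage sketch for `_mm_add_si64`, added here because the
// original has no unittest for it. It assumes `_mm_setr_pi32` from inteli.mmx (imported
// above) to build the `__m64` operands.
unittest
{
    __m64 A = _mm_setr_pi32(10, 0); // low 32 bits = 10, high 32 bits = 0
    __m64 B = _mm_setr_pi32(32, 0); // low 32 bits = 32, high 32 bits = 0
    __m64 R = _mm_add_si64(A, B);
    assert(R.array[0] == 42);
}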

/// Add packed 16-bit integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSW since LDC 1.15 -O0
            // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSB since LDC 1.15 -O0
            // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}
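
// Editor's note: an illustrative sketch (not an upstream unittest) showing the saturating
// behaviour itself: 30000 + 30000 exceeds short.max and clamps to 32767.
unittest
{
    short8 R = cast(short8) _mm_adds_epi16(_mm_set1_epi16(30000), _mm_set1_epi16(30000));
    foreach(i; 0..8)
        assert(R.array[i] == 32767);
}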

/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSB since LDC 1.15 -O0
            // ARM: Generates uqadd.16b since LDC 1.21 -O1
            enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusb128(a, b);
    }
    else
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16)
        _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                      _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14,
                                               0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}
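
// Editor's note: an illustrative sketch (not an upstream unittest) contrasting unsigned
// saturation with plain wrap-around addition: 200 + 100 clamps to 255 with _mm_adds_epu8,
// while _mm_add_epi8 wraps to 44.
unittest
{
    __m128i A = _mm_set1_epi8(cast(byte)200);
    __m128i B = _mm_set1_epi8(cast(byte)100);
    byte16 sat  = cast(byte16) _mm_adds_epu8(A, B);
    byte16 wrap = cast(byte16) _mm_add_epi8(A, B);
    assert(sat.array[0]  == cast(byte)255);
    assert(wrap.array[0] == 44);
}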

/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSW since LDC 1.15 -O0
            // ARM: Generates uqadd.8h since LDC 1.21 -O1
            enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ushort[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusw128(a, b);
    }
    else
    {
        ushort[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( cast(long2)a & cast(long2)b );
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m128d A = _mm_set_pd(a, b);
    __m128d B = _mm_set_pd(b, a);
    long2 R = cast(long2)( _mm_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    return a & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_and_si128(A, B);
    int[4] correct = [6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of packed double-precision (64-bit)
/// floating-point elements in `a` and then AND with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
    long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
    __m128d A = _mm_setr_pd(a, b);
    __m128d B = _mm_setr_pd(b, a);
    long2 R = cast(long2)( _mm_andnot_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct2);
}
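
// Editor's note: an illustrative sketch (not an upstream unittest) of a classic
// _mm_andnot_pd use: clearing the sign bit of both lanes to compute absolute values,
// since -0.0 has only the sign bit set.
unittest
{
    __m128d signBit = _mm_set1_pd(-0.0);
    __m128d X = _mm_setr_pd(-3.5, 8.0);
    __m128d R = _mm_andnot_pd(signBit, X); // ~signbit & x
    assert(R.array == [3.5, 8.0]);
}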

/// Compute the bitwise NOT of 128 bits (representing integer data)
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_andnot_si128(A, B);
    int[4] correct = [8, 8, 8, 8];
    assert(R.array == correct);
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48);
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
    }
    else version(LDC)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48);
}
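
// Editor's note: an illustrative sketch (not an upstream unittest) showing that the
// average intrinsics round *up*, i.e. they compute (a + b + 1) >> 1: avg(0, 1) is 1.
unittest
{
    byte16 R = cast(byte16) _mm_avg_epu8(_mm_set1_epi8(0), _mm_set1_epi8(1));
    assert(R.array[0] == 1);
}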

/// Shift `a` left by `bytes` bytes while shifting in zeros.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Cast vector of type `__m128d` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Cast vector of type `__m128d` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128i` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128i` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}

/// Invalidate and flush the cache line that contains `p`
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else
    {
        // Do nothing. Invalidating cacheline does
        // not affect correctness.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, -1, 0, 0, 0];
    short8 R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
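
// Editor's note: an illustrative sketch (not an upstream unittest) of a common use of
// the all-ones/all-zeroes comparison masks: a branchless per-lane select built from
// _mm_and_si128 / _mm_andnot_si128 (and _mm_or_si128, defined later in this module).
unittest
{
    short8 X = [1, 2, 3, 4, 5, 6, 7, 8];
    short8 Y = [1, 0, 3, 0, 5, 0, 7, 0];
    __m128i mask = _mm_cmpeq_epi16(cast(__m128i)X, cast(__m128i)Y);
    // Keep elements of X where X == Y, take -1 elsewhere.
    __m128i R = _mm_or_si128(_mm_and_si128(mask, cast(__m128i)X),
                             _mm_andnot_si128(mask, _mm_set1_epi16(-1)));
    short[8] correct = [1, -1, 3, -1, 5, -1, 7, -1];
    assert((cast(short8)R).array == correct);
}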

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4 A = [-3, -2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, -1];
    int4 R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8 A = [-3, -2, -1, 0, 0, 1, 2, 3];
    short8 B = [ 4, 3, 2, 1, 0, -1, -2, -3];
    short[8] E = [ 0, 0, 0, 0, 0, -1, -1, -1];
    short8 R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}
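
// Editor's note: an illustrative sketch (not an upstream unittest). PCMPGT is a *signed*
// comparison; an unsigned 16-bit greater-than can be emulated by flipping the sign bit
// of both operands before comparing.
unittest
{
    __m128i A = _mm_set1_epi16(cast(short)0xFFFF); // 65535 unsigned, -1 signed
    __m128i B = _mm_set1_epi16(1);
    // Signed view: -1 > 1 is false.
    assert((cast(short8) _mm_cmpgt_epi16(A, B)).array[0] == 0);
    // Unsigned view: 65535 > 1 is true after biasing both operands by 0x8000.
    __m128i bias = _mm_set1_epi16(cast(short)0x8000);
    __m128i R = _mm_cmpgt_epi16(_mm_xor_si128(A, bias), _mm_xor_si128(B, bias));
    assert((cast(short8)R).array[0] == -1);
}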

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b));
    }
}
unittest
{
    int4 A = [-3, 2, -1, 0];
    int4 B = [ 4, -2, 2, 0];
    int[4] E = [ 0, -1, 0, 0];
    int4 R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else
    {
        return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct = [0, 0,-1, 0, 0, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    __m128i D = _mm_cmpeq_epi8(A, B);
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}

/// Compare packed 32-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal, store the result in
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}
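
// Editor's note: an illustrative sketch (not an upstream unittest) of the ordered vs.
// unordered distinction hinted at by the FPComparison flags above: with a NaN operand
// the ordered compare (_mm_cmpgt_pd) yields false, while its negation (_mm_cmpngt_pd)
// yields true.
unittest
{
    __m128d N = _mm_set1_pd(double.nan);
    __m128d Z = _mm_setr_pd(0.0, 0.0);
    long2 gt  = cast(long2) _mm_cmpgt_pd(N, Z);  // ordered greater-than: 0 on NaN
    long2 ngt = cast(long2) _mm_cmpngt_pd(N, Z); // not-greater-than (unordered): all ones on NaN
    assert(gt.array[0] == 0 && gt.array[1] == 0);
    assert(ngt.array[0] == -1 && ngt.array[1] == -1);
}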

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN, store the result in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN, store the result in the lower
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}
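
// Editor's note: an illustrative sketch (not an upstream unittest) of the classic NaN
// detection idiom: _mm_cmpord_pd(a, a) is true exactly for the lanes that are not NaN.
unittest
{
    __m128d A = _mm_setr_pd(double.nan, 1.0);
    long2 ord = cast(long2) _mm_cmpord_pd(A, A);
    assert(ord.array[0] == 0);  // NaN lane: unordered
    assert(ord.array[1] == -1); // ordinary lane: ordered
}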

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    // Note: For some of the _mm_comixx_sx intrinsics, the NaN semantics are not those of the
    // comisd instruction: the intrinsic returns false for unordered operands instead.
    //
    // C++ compilers actually disagree over the meaning of these intrinsics.
    // GCC handles NaNs like the comisd instruction (returning true if unordered),
    // but ICC, clang and MSVC handle NaN the way the Intel Intrinsics Guide says.
    // We follow the majority; GCC seems to be buggy with NaNs.
    return a.array[0] == b.array[0];
}
unittest
{
    assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than-or-equal, and return the boolean
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] >= b.array[0];
}
unittest
{
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] > b.array[0];
}
unittest
{
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] <= b.array[0];
}
unittest
{
    assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] < b.array[0];
}
unittest
{
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] != b.array[0];
}
unittest
{
    assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else
    {
        // x86: Generates cvtdq2ps since LDC 1.0.0 -O1
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O2
        __m128 res;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed 32-bit integers.
// PERF ARM32
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        long2 i;
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
            case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
            case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
        }
        int4 zero = 0;
        return cast(__m128i) shufflevector!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
    }
    else
    {
        // PERF ARM32
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    {
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        // Disabled, since it fails with optimizations unfortunately
        //alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
        return __asm!__m128i("cvtps2dq $1,$0","=x,x",a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    // GDC bug #98607
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide an optimization barrier for the rounding mode.
    // Worked around with different literals. This bug will likely only manifest in unittests.
    // GDC people provided no actual fix and instead say other compilers are buggy... when they aren't.

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}
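
// Editor's note: an illustrative sketch (not an upstream unittest) showing that
// _mm_cvtsd_si32 honours the current MXCSR rounding mode, unlike the truncating
// _mm_cvttsd_si32 further below.
unittest
{
    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(5 == _mm_cvtsd_si32(_mm_set1_pd(4.6)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.6)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}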

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtsd_si64 (__m128d a) @trusted
{
    version (LDC)
    {
        version (X86_64)
        {
            return __builtin_ia32_cvtsd2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
    else
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///

/// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit)
/// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b);
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a.ptr[0] = b.array[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}

/// Get the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la.array[0];
}
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;

/// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 r = [0, 0, 0, 0];
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}

/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}

deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///

/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit)
/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the
/// upper element of result.
double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
/// Put zeroes in the upper elements of result.
__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: generates fcvtze since LDC 1.8 -O2
    __m128i r;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}
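
// Editor's note: an illustrative sketch (not an upstream unittest) stressing that the
// cvtt* conversions truncate toward zero regardless of the MXCSR rounding mode.
unittest
{
    __m128 f = _mm_setr_ps(1.9f, -1.9f, 0.5f, -0.5f);
    __m128i t = _mm_cvttps_epi32(f);
    assert(t.array == [1, -1, 0, 0]);
}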

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit instead, it's a long sequence that resorts to the FPU
    return cast(long)a.array[0];
}

deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///

/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}

/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`, store the
/// result in the lower element, and copy the upper element from `a`.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_divsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        a.ptr[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

/// Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction.
void _mm_lfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_lfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
    else version(LDC)
    {
        llvm_memory_fence(); // PERF actually generates mfence
    }
    else
        static assert(false);
}
unittest
{
    _mm_lfence();
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}
unittest
{
    align(16) double[2] S = [-5.0, 7.0];
    __m128d R = _mm_load_pd(S.ptr);
    assert(R.array == S);
}

/// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double m = *mem_addr;
    __m128d r;
    r.ptr[0] = m;
    r.ptr[1] = m;
    return r;
}
unittest
{
    double what = 4;
    __m128d R = _mm_load_pd1(&what);
    double[2] correct = [4.0, 4];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper
/// element. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Load 128-bits of integer data from memory into dst.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shouldn't be @trusted because of alignment, Issue #62
{
    return *mem_addr;
}
unittest
{
    align(16) int[4] correct = [-1, 2, 3, 4];
    int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr);
    assert(A.array == correct);
}

alias _mm_load1_pd = _mm_load_pd1; ///

/// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the
/// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted
{
    a.ptr[1] = *mem_addr;
    return a;
}
unittest
{
    double A = 7.0;
    __m128d B = _mm_setr_pd(4.0, -5.0);
    __m128d R = _mm_loadh_pd(B, &A);
    double[2] correct = [ 4.0, 7.0 ];
    assert(R.array == correct);
}

/// Load 64-bit integer from memory into the first element of result. Zero out the other.
// Note: strange signature since the memory doesn't have to be aligned (Issue #60)
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature
{
    auto pLong = cast(const(long)*)mem_addr;
    long2 r = [0, 0];
    r.ptr[0] = *pLong;
    return cast(__m128i)(r);
}
unittest
{
    long A = 0x7878787870707070;
    long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A);
    long[2] correct = [0x7878787870707070, 0];
    assert(R.array == correct);
}
1885 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted 1886 { 1887 a.ptr[0] = *mem_addr; 1888 return a; 1889 } 1890 unittest 1891 { 1892 double A = 7.0; 1893 __m128d B = _mm_setr_pd(4.0, -5.0); 1894 __m128d R = _mm_loadl_pd(B, &A); 1895 double[2] correct = [ 7.0, -5.0 ]; 1896 assert(R.array == correct); 1897 } 1898 1899 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 1900 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1901 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted 1902 { 1903 __m128d a = *cast(__m128d*)(mem_addr); 1904 __m128d r; 1905 r.ptr[0] = a.array[1]; 1906 r.ptr[1] = a.array[0]; 1907 return r; 1908 } 1909 unittest 1910 { 1911 align(16) double[2] A = [56.0, -74.0]; 1912 __m128d R = _mm_loadr_pd(A.ptr); 1913 double[2] correct = [-74.0, 56.0]; 1914 assert(R.array == correct); 1915 } 1916 1917 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 1918 /// `mem_addr` does not need to be aligned on any particular boundary. 1919 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted 1920 { 1921 static if (GDC_with_SSE2) 1922 { 1923 return __builtin_ia32_loadupd(mem_addr); 1924 } 1925 else version(LDC) 1926 { 1927 return loadUnaligned!(double2)(mem_addr); 1928 } 1929 else version(DigitalMars) 1930 { 1931 static if (DMD_with_DSIMD) 1932 { 1933 return cast(__m128d)__simd(XMM.LODUPD, *mem_addr); 1934 } 1935 else static if (SSESizedVectorsAreEmulated) 1936 { 1937 // Since this vector is emulated, it doesn't have alignement constraints 1938 // and as such we can just cast it. 1939 return *cast(__m128d*)(mem_addr); 1940 } 1941 else 1942 { 1943 __m128d result; 1944 result.ptr[0] = mem_addr[0]; 1945 result.ptr[1] = mem_addr[1]; 1946 return result; 1947 } 1948 } 1949 else 1950 { 1951 __m128d result; 1952 result.ptr[0] = mem_addr[0]; 1953 result.ptr[1] = mem_addr[1]; 1954 return result; 1955 } 1956 } 1957 unittest 1958 { 1959 double[2] A = [56.0, -75.0]; 1960 __m128d R = _mm_loadu_pd(A.ptr); 1961 double[2] correct = [56.0, -75.0]; 1962 assert(R.array == correct); 1963 } 1964 1965 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary. 1966 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted 1967 { 1968 static if (GDC_with_SSE2) 1969 { 1970 return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr); 1971 } 1972 else 1973 { 1974 return loadUnaligned!(__m128i)(cast(int*)mem_addr); 1975 } 1976 } 1977 unittest 1978 { 1979 align(16) int[4] correct = [-1, 2, -3, 4]; 1980 int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr); 1981 assert(A.array == correct); 1982 } 1983 1984 /// Load unaligned 32-bit integer from memory into the first element of result. 1985 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted 1986 { 1987 int r = *cast(int*)(mem_addr); 1988 int4 result = [0, 0, 0, 0]; 1989 result.ptr[0] = r; 1990 return result; 1991 } 1992 unittest 1993 { 1994 int r = 42; 1995 __m128i A = _mm_loadu_si32(&r); 1996 int[4] correct = [42, 0, 0, 0]; 1997 assert(A.array == correct); 1998 } 1999 2000 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 2001 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 2002 /// and pack the results in destination. 
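// Illustrative addition: the unaligned loads above accept any byte offset, so a
// 16-byte read from the middle of a byte buffer is fine; values are arbitrary.
unittest
{
    ubyte[20] buf;
    foreach(i; 0..20)
        buf[i] = cast(ubyte)i;
    byte16 v = cast(byte16) _mm_loadu_si128(cast(const(__m128i)*)(buf.ptr + 3));
    assert(v.array[0] == 3);
    assert(v.array[15] == 18);
}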
2003 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted 2004 { 2005 static if (GDC_with_SSE2) 2006 { 2007 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2008 } 2009 else static if (LDC_with_SSE2) 2010 { 2011 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2012 } 2013 else static if (LDC_with_ARM64) 2014 { 2015 int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b)); 2016 int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b)); 2017 int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); 2018 int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); 2019 return vcombine_s32(rl, rh); 2020 } 2021 else 2022 { 2023 short8 sa = cast(short8)a; 2024 short8 sb = cast(short8)b; 2025 int4 r; 2026 foreach(i; 0..4) 2027 { 2028 r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1]; 2029 } 2030 return r; 2031 } 2032 } 2033 unittest 2034 { 2035 short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2036 short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2037 int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B); 2038 int[4] correct = [1, 13, -2147483648, 2*32767*32767]; 2039 assert(R.array == correct); 2040 } 2041 2042 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 2043 /// (elements are not stored when the highest bit is not set in the corresponding element) 2044 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 2045 /// boundary. 2046 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted 2047 { 2048 static if (GDC_with_SSE2) 2049 { 2050 return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr); 2051 } 2052 else static if (LDC_with_SSE2) 2053 { 2054 return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr); 2055 } 2056 else static if (LDC_with_ARM64) 2057 { 2058 // PERF: catastrophic on ARM32 2059 byte16 bmask = cast(byte16)mask; 2060 byte16 shift = 7; 2061 bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask 2062 mask = cast(__m128i) bmask; 2063 __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr); 2064 dest = (a & mask) | (dest & ~mask); 2065 storeUnaligned!__m128i(dest, cast(int*)mem_addr); 2066 } 2067 else 2068 { 2069 byte16 b = cast(byte16)a; 2070 byte16 m = cast(byte16)mask; 2071 byte* dest = cast(byte*)(mem_addr); 2072 foreach(j; 0..16) 2073 { 2074 if (m.array[j] & 128) 2075 { 2076 dest[j] = b.array[j]; 2077 } 2078 } 2079 } 2080 } 2081 unittest 2082 { 2083 ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]; 2084 __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0); 2085 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15); 2086 _mm_maskmoveu_si128(A, mask, dest.ptr); 2087 ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42]; 2088 assert(dest == correct); 2089 } 2090 2091 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values. 
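// Illustrative addition: _mm_madd_epi16 is the usual building block for 16-bit dot
// products; a hedged sketch reducing 8 products to 4 partial sums, then to a scalar.
unittest
{
    __m128i coef = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
    __m128i samp = _mm_set1_epi16(10);
    int4 partial = cast(int4) _mm_madd_epi16(coef, samp);
    int sum = partial.array[0] + partial.array[1] + partial.array[2] + partial.array[3];
    assert(sum == 360); // 10 * (1+2+...+8)
}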
2092 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe 2093 { 2094 version(GNU) 2095 { 2096 // PERF: not necessarily the best for GDC 2097 __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else 2098 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2099 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2100 return _mm_xor_si128(b, mask); 2101 } 2102 else 2103 { 2104 // x86: pmaxsw since LDC 1.0 -O1 2105 // ARM: smax.8h since LDC 1.5 -01 2106 short8 sa = cast(short8)a; 2107 short8 sb = cast(short8)b; 2108 short8 greater = greaterMask!short8(sa, sb); 2109 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2110 } 2111 } 2112 unittest 2113 { 2114 short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57), 2115 _mm_setr_epi16(-4,-8, 9, 7, 0,-32768, 0, 0)); 2116 short[8] correct = [32767, 1, 9, 7, 9, 7, 0, 0]; 2117 assert(R.array == correct); 2118 } 2119 2120 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values. 2121 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe 2122 { 2123 version(LDC) 2124 { 2125 // x86: pmaxub since LDC 1.0.0 -O1 2126 // ARM64: umax.16b since LDC 1.5.0 -O1 2127 // PERF: catastrophic on ARM32 2128 ubyte16 sa = cast(ubyte16)a; 2129 ubyte16 sb = cast(ubyte16)b; 2130 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2131 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2132 } 2133 else 2134 { 2135 __m128i value128 = _mm_set1_epi8(-128); 2136 __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2137 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2138 __m128i mask = _mm_and_si128(aTob, higher); 2139 return _mm_xor_si128(b, mask); 2140 } 2141 } 2142 unittest 2143 { 2144 byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2145 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2146 byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 2147 assert(R.array == correct); 2148 } 2149 2150 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed maximum values. 2151 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted 2152 { 2153 static if (GDC_with_SSE2) 2154 { 2155 return __builtin_ia32_maxpd(a, b); 2156 } 2157 else 2158 { 2159 // x86: Generates maxpd starting with LDC 1.9 -O2 2160 a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; 2161 a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1]; 2162 return a; 2163 } 2164 } 2165 unittest 2166 { 2167 __m128d A = _mm_setr_pd(4.0, 1.0); 2168 __m128d B = _mm_setr_pd(1.0, 8.0); 2169 __m128d M = _mm_max_pd(A, B); 2170 assert(M.array[0] == 4.0); 2171 assert(M.array[1] == 8.0); 2172 } 2173 2174 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 2175 /// lower element of result, and copy the upper element from `a` to the upper element of result. 2176 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted 2177 { 2178 static if (GDC_with_SSE2) 2179 { 2180 return __builtin_ia32_maxsd(a, b); 2181 } 2182 else 2183 { 2184 __m128d r = a; 2185 // Generates maxsd starting with LDC 1.3 2186 r.ptr[0] = (a.array[0] > b.array[0]) ? 
                                 a.array[0] : b.array[0];
        return r;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 1.0);
    __m128d B = _mm_setr_pd(4.0, 2.0);
    __m128d M = _mm_max_sd(A, B);
    assert(M.array[0] == 4.0);
    assert(M.array[1] == 1.0);
}

/// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to
/// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction
/// is globally visible before any memory instruction which follows the fence in program order.
void _mm_mfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_mfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "mfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_mfence();
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            mfence;
        }
    }
    else version(LDC)
    {
        // Note: will generate the DMB instruction on ARM
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_mfence();
}

/// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
__m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe
{
    version(GNU)
    {
        // PERF: not necessarily the best for GDC
        __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lowerShorts);
        return _mm_xor_si128(b, mask);
    }
    else
    {
        // x86: pminsw since LDC 1.0 -O1
        // ARM64: smin.8h since LDC 1.5 -O1
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 greater = greaterMask!short8(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
}
unittest
{
    short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45,  1, -4, -8, 9,   7, 0, -32768),
                                          _mm_setr_epi16(-4, -8,  9,  7, 0, -57, 0,      0));
    short[8] correct =                                  [-4, -8, -4, -8, 0, -57, 0, -32768];
    assert(R.array == correct);
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values.
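// Illustrative addition: combining the signed 16-bit max/min above gives a
// branchless clamp; the bounds and inputs here are arbitrary.
unittest
{
    __m128i lo = _mm_set1_epi16(-100);
    __m128i hi = _mm_set1_epi16( 100);
    __m128i x  = _mm_setr_epi16(-32768, -101, -100, 0, 99, 100, 101, 32767);
    short8 clamped = cast(short8) _mm_min_epi16(_mm_max_epi16(x, lo), hi);
    short[8] correct = [-100, -100, -100, 0, 99, 100, 100, 100];
    assert(clamped.array == correct);
}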
2277 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe 2278 { 2279 version(LDC) 2280 { 2281 // x86: pminub since LDC 1.0.0 -O1 2282 // ARM: umin.16b since LDC 1.5.0 -O1 2283 // PERF: catastrophic on ARM32 2284 ubyte16 sa = cast(ubyte16)a; 2285 ubyte16 sb = cast(ubyte16)b; 2286 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2287 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2288 } 2289 else 2290 { 2291 __m128i value128 = _mm_set1_epi8(-128); 2292 __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2293 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2294 __m128i mask = _mm_and_si128(aTob, lower); 2295 return _mm_xor_si128(b, mask); 2296 } 2297 } 2298 unittest 2299 { 2300 byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2301 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2302 byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; 2303 assert(R.array == correct); 2304 } 2305 2306 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values. 2307 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted 2308 { 2309 static if (GDC_with_SSE2) 2310 { 2311 return __builtin_ia32_minpd(a, b); 2312 } 2313 else 2314 { 2315 // Generates minpd starting with LDC 1.9 2316 a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2317 a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1]; 2318 return a; 2319 } 2320 } 2321 unittest 2322 { 2323 __m128d A = _mm_setr_pd(1.0, 2.0); 2324 __m128d B = _mm_setr_pd(4.0, 1.0); 2325 __m128d M = _mm_min_pd(A, B); 2326 assert(M.array[0] == 1.0); 2327 assert(M.array[1] == 1.0); 2328 } 2329 2330 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 2331 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 2332 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe 2333 { 2334 static if (GDC_with_SSE2) 2335 { 2336 return __builtin_ia32_minsd(a, b); 2337 } 2338 else 2339 { 2340 // Generates minsd starting with LDC 1.3 2341 __m128d r = a; 2342 r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2343 return r; 2344 } 2345 } 2346 unittest 2347 { 2348 __m128d A = _mm_setr_pd(1.0, 3.0); 2349 __m128d B = _mm_setr_pd(4.0, 2.0); 2350 __m128d M = _mm_min_sd(A, B); 2351 assert(M.array[0] == 1.0); 2352 assert(M.array[1] == 3.0); 2353 } 2354 2355 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element. 2356 __m128i _mm_move_epi64 (__m128i a) pure @trusted 2357 { 2358 static if (GDC_with_SSE2) 2359 { 2360 // slightly better with GDC -O0 2361 return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 2362 } 2363 else 2364 { 2365 long2 result = [ 0, 0 ]; 2366 long2 la = cast(long2) a; 2367 result.ptr[0] = la.array[0]; 2368 return cast(__m128i)(result); 2369 } 2370 } 2371 unittest 2372 { 2373 long2 A = [13, 47]; 2374 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); 2375 long[2] correct = [13, 0]; 2376 assert(B.array == correct); 2377 } 2378 2379 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 2380 /// the upper element from `a` to the upper element of dst. 
2381 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted 2382 { 2383 static if (GDC_with_SSE2) 2384 { 2385 return __builtin_ia32_movsd(a, b); 2386 } 2387 else 2388 { 2389 b.ptr[1] = a.array[1]; 2390 return b; 2391 } 2392 } 2393 unittest 2394 { 2395 double2 A = [13.0, 47.0]; 2396 double2 B = [34.0, 58.0]; 2397 double2 C = _mm_move_sd(A, B); 2398 double[2] correct = [34.0, 47.0]; 2399 assert(C.array == correct); 2400 } 2401 2402 /// Create mask from the most significant bit of each 8-bit element in `v`. 2403 int _mm_movemask_epi8 (__m128i a) pure @trusted 2404 { 2405 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 2406 static if (GDC_with_SSE2) 2407 { 2408 return __builtin_ia32_pmovmskb128(cast(ubyte16)a); 2409 } 2410 else static if (LDC_with_SSE2) 2411 { 2412 return __builtin_ia32_pmovmskb128(cast(byte16)a); 2413 } 2414 else static if (LDC_with_ARM64) 2415 { 2416 // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon 2417 // The other two solutions lead to unfound intrinsics in LLVM and that took a long time. 2418 // SO there might be something a bit faster, but this one is reasonable and branchless. 2419 byte8 mask_shift; 2420 mask_shift.ptr[0] = 7; 2421 mask_shift.ptr[1] = 6; 2422 mask_shift.ptr[2] = 5; 2423 mask_shift.ptr[3] = 4; 2424 mask_shift.ptr[4] = 3; 2425 mask_shift.ptr[5] = 2; 2426 mask_shift.ptr[6] = 1; 2427 mask_shift.ptr[7] = 0; 2428 byte8 mask_and = byte8(-128); 2429 byte8 lo = vget_low_u8(cast(byte16)a); 2430 byte8 hi = vget_high_u8(cast(byte16)a); 2431 lo = vand_u8(lo, mask_and); 2432 lo = vshr_u8(lo, mask_shift); 2433 hi = vand_u8(hi, mask_and); 2434 hi = vshr_u8(hi, mask_shift); 2435 lo = vpadd_u8(lo,lo); 2436 lo = vpadd_u8(lo,lo); 2437 lo = vpadd_u8(lo,lo); 2438 hi = vpadd_u8(hi,hi); 2439 hi = vpadd_u8(hi,hi); 2440 hi = vpadd_u8(hi,hi); 2441 return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]); 2442 } 2443 else 2444 { 2445 byte16 ai = cast(byte16)a; 2446 int r = 0; 2447 foreach(bit; 0..16) 2448 { 2449 if (ai.array[bit] < 0) r += (1 << bit); 2450 } 2451 return r; 2452 } 2453 } 2454 unittest 2455 { 2456 assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0))); 2457 } 2458 2459 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 2460 /// loating-point element in `v`. 2461 int _mm_movemask_pd(__m128d v) pure @safe 2462 { 2463 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 2464 static if (GDC_with_SSE2) 2465 { 2466 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2467 /// packed double-precision (64-bit) floating-point element in `v`. 2468 return __builtin_ia32_movmskpd(v); 2469 } 2470 else static if (LDC_with_SSE2) 2471 { 2472 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2473 /// packed double-precision (64-bit) floating-point element in `v`. 2474 return __builtin_ia32_movmskpd(v); 2475 } 2476 else 2477 { 2478 long2 lv = cast(long2)v; 2479 int r = 0; 2480 if (lv.array[0] < 0) r += 1; 2481 if (lv.array[1] < 0) r += 2; 2482 return r; 2483 } 2484 } 2485 unittest 2486 { 2487 __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0); 2488 assert(_mm_movemask_pd(A) == 2); 2489 } 2490 2491 /// Copy the lower 64-bit integer in `v`. 
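// Illustrative addition: _mm_movemask_epi8 combined with _mm_cmpeq_epi8 (defined
// earlier in this module) is the classic way to locate a byte in a 16-byte block.
unittest
{
    import core.bitop : bsf;
    __m128i block = _mm_setr_epi8(10, 11, 12, 13, 14, 15, 16, 17,
                                  18, 19, 20, 21, 22, 23, 24, 25);
    __m128i eq = _mm_cmpeq_epi8(block, _mm_set1_epi8(21));
    int mask = _mm_movemask_epi8(eq);
    assert(mask != 0);
    assert(bsf(cast(uint)mask) == 11); // byte 21 sits at index 11
}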
2492 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe 2493 { 2494 long2 lv = cast(long2)v; 2495 return long1(lv.array[0]); 2496 } 2497 unittest 2498 { 2499 __m128i A = _mm_set_epi64x(-1, -2); 2500 __m64 R = _mm_movepi64_pi64(A); 2501 assert(R.array[0] == -2); 2502 } 2503 2504 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element. 2505 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted 2506 { 2507 long2 r; 2508 r.ptr[0] = a.array[0]; 2509 r.ptr[1] = 0; 2510 return cast(__m128i)r; 2511 } 2512 2513 // Note: generates pmuludq in LDC with -O1 2514 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted 2515 { 2516 __m128i zero = _mm_setzero_si128(); 2517 2518 static if (__VERSION__ >= 2088) 2519 { 2520 // Need LLVM9 to avoid this shufflevector 2521 long2 la, lb; 2522 la.ptr[0] = cast(uint)a.array[0]; 2523 la.ptr[1] = cast(uint)a.array[2]; 2524 lb.ptr[0] = cast(uint)b.array[0]; 2525 lb.ptr[1] = cast(uint)b.array[2]; 2526 } 2527 else 2528 { 2529 long2 la = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(a, zero); 2530 long2 lb = cast(long2) shufflevector!(int4, 0, 4, 2, 6)(b, zero); 2531 } 2532 2533 version(DigitalMars) 2534 { 2535 // DMD has no long2 mul 2536 // long2 mul not supported before LDC 1.5 2537 la.ptr[0] *= lb.array[0]; 2538 la.ptr[1] *= lb.array[1]; 2539 return cast(__m128i)(la); 2540 } 2541 else 2542 { 2543 static if (__VERSION__ >= 2076) 2544 { 2545 return cast(__m128i)(la * lb); 2546 } 2547 else 2548 { 2549 // long2 mul not supported before LDC 1.5 2550 la.ptr[0] *= lb.array[0]; 2551 la.ptr[1] *= lb.array[1]; 2552 return cast(__m128i)(la); 2553 } 2554 } 2555 } 2556 unittest 2557 { 2558 __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff); 2559 __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff); 2560 __m128i C = _mm_mul_epu32(A, B); 2561 long2 LC = cast(long2)C; 2562 assert(LC.array[0] == 18446744065119617025uL); 2563 assert(LC.array[1] == 12723420444339690338uL); 2564 } 2565 2566 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 2567 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe 2568 { 2569 return a * b; 2570 } 2571 unittest 2572 { 2573 __m128d a = [-2.0, 1.5]; 2574 a = _mm_mul_pd(a, a); 2575 assert(a.array == [4.0, 2.25]); 2576 } 2577 2578 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 2579 /// element of result, and copy the upper element from `a` to the upper element of result. 2580 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted 2581 { 2582 version(DigitalMars) 2583 { 2584 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 2585 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 2586 asm pure nothrow @nogc @trusted { nop;} 2587 a.array[0] = a.array[0] * b.array[0]; 2588 return a; 2589 } 2590 else static if (GDC_with_SSE2) 2591 { 2592 return __builtin_ia32_mulsd(a, b); 2593 } 2594 else 2595 { 2596 a.ptr[0] *= b.array[0]; 2597 return a; 2598 } 2599 } 2600 unittest 2601 { 2602 __m128d a = [-2.0, 1.5]; 2603 a = _mm_mul_sd(a, a); 2604 assert(a.array == [4.0, 1.5]); 2605 } 2606 2607 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 2608 /// and get an unsigned 64-bit result. 
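// Added check: _mm_movpi64_epi64 above has no unittest; a minimal sketch.
unittest
{
    __m64 a = _mm_cvtsi64_m64(0x123456789ABCDEF0);
    long2 R = cast(long2) _mm_movpi64_epi64(a);
    long[2] correct = [0x123456789ABCDEF0, 0];
    assert(R.array == correct);
}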
2609 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe 2610 { 2611 return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b))); 2612 } 2613 unittest 2614 { 2615 __m64 A = _mm_set_pi32(42, 0xDEADBEEF); 2616 __m64 B = _mm_set_pi32(42, 0xCAFEBABE); 2617 __m64 C = _mm_mul_su32(A, B); 2618 assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL); 2619 } 2620 2621 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2622 /// high 16 bits of the intermediate integers. 2623 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted 2624 { 2625 static if (GDC_with_SSE2) 2626 { 2627 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2628 } 2629 else static if (LDC_with_SSE2) 2630 { 2631 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2632 } 2633 else 2634 { 2635 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h 2636 // PERF: it seems the simde solution has one less instruction in ARM64. 2637 // PERF: Catastrophic in ARM32. 2638 short8 sa = cast(short8)a; 2639 short8 sb = cast(short8)b; 2640 short8 r = void; 2641 r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16; 2642 r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16; 2643 r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16; 2644 r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16; 2645 r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16; 2646 r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16; 2647 r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16; 2648 r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16; 2649 return cast(__m128i)r; 2650 } 2651 } 2652 unittest 2653 { 2654 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2655 __m128i B = _mm_set1_epi16(16384); 2656 short8 R = cast(short8)_mm_mulhi_epi16(A, B); 2657 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; 2658 assert(R.array == correct); 2659 } 2660 2661 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2662 /// high 16 bits of the intermediate integers. 2663 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted 2664 { 2665 static if (GDC_with_SSE2) 2666 { 2667 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2668 } 2669 else static if (LDC_with_SSE2) 2670 { 2671 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2672 } 2673 else 2674 { 2675 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h 2676 // it seems the simde solution has one less instruction in ARM64 2677 // PERF: Catastrophic in ARM32. 
2678 short8 sa = cast(short8)a; 2679 short8 sb = cast(short8)b; 2680 short8 r = void; 2681 r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 ); 2682 r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 ); 2683 r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 ); 2684 r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 ); 2685 r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 ); 2686 r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 ); 2687 r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 ); 2688 r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 ); 2689 return cast(__m128i)r; 2690 } 2691 } 2692 unittest 2693 { 2694 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2695 __m128i B = _mm_set1_epi16(16384); 2696 short8 R = cast(short8)_mm_mulhi_epu16(A, B); 2697 short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; 2698 assert(R.array == correct); 2699 } 2700 2701 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 2702 /// bits of the intermediate integers. 2703 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe 2704 { 2705 return cast(__m128i)(cast(short8)a * cast(short8)b); 2706 } 2707 unittest 2708 { 2709 __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7); 2710 __m128i B = _mm_set1_epi16(16384); 2711 short8 R = cast(short8)_mm_mullo_epi16(A, B); 2712 short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384]; 2713 assert(R.array == correct); 2714 } 2715 2716 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 2717 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe 2718 { 2719 return cast(__m128d)( cast(__m128i)a | cast(__m128i)b ); 2720 } 2721 2722 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`. 2723 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe 2724 { 2725 return a | b; 2726 } 2727 2728 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation. 
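// Added checks: the two OR intrinsics above have no unittests; minimal sketches.
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 4, 8);
    __m128i B = _mm_setr_epi32(2, 2, 1, 8);
    int4 R = cast(int4) _mm_or_si128(A, B);
    int[4] correct = [3, 2, 5, 8];
    assert(R.array == correct);

    // OR-ing the sign bit of -0.0 into 2.0 flips its sign.
    __m128d C = _mm_setr_pd(4.0, -0.0);
    __m128d D = _mm_setr_pd(4.0,  2.0);
    __m128d E = _mm_or_pd(C, D);
    double[2] expected = [4.0, -2.0];
    assert(E.array == expected);
}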
2729 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted 2730 { 2731 static if (GDC_with_SSE2) 2732 { 2733 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2734 } 2735 else static if (LDC_with_SSE2) 2736 { 2737 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2738 } 2739 else static if (LDC_with_ARM64) 2740 { 2741 short4 ra = vqmovn_s32(cast(int4)a); 2742 short4 rb = vqmovn_s32(cast(int4)b); 2743 return cast(__m128i)vcombine_s16(ra, rb); 2744 } 2745 else 2746 { 2747 // PERF: catastrophic on ARM 2748 short8 r; 2749 r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]); 2750 r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]); 2751 r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]); 2752 r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]); 2753 r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]); 2754 r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]); 2755 r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]); 2756 r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]); 2757 return cast(__m128i)r; 2758 } 2759 } 2760 unittest 2761 { 2762 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 2763 short8 R = cast(short8) _mm_packs_epi32(A, A); 2764 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; 2765 assert(R.array == correct); 2766 } 2767 2768 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation. 2769 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted 2770 { 2771 static if (GDC_with_SSE2) 2772 { 2773 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2774 } 2775 else static if (LDC_with_SSE2) 2776 { 2777 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2778 } 2779 else static if (LDC_with_ARM64) 2780 { 2781 // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02 2782 byte8 ra = vqmovn_s16(cast(short8)a); 2783 byte8 rb = vqmovn_s16(cast(short8)b); 2784 return cast(__m128i)vcombine_s8(ra, rb); 2785 } 2786 else 2787 { 2788 // PERF: ARM32 is missing 2789 byte16 r; 2790 short8 sa = cast(short8)a; 2791 short8 sb = cast(short8)b; 2792 foreach(i; 0..8) 2793 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]); 2794 foreach(i; 0..8) 2795 r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]); 2796 return cast(__m128i)r; 2797 } 2798 } 2799 unittest 2800 { 2801 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); 2802 byte16 R = cast(byte16) _mm_packs_epi16(A, A); 2803 byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0, 2804 127, -128, 127, 0, 127, -128, 127, 0]; 2805 assert(R.array == correct); 2806 } 2807 2808 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation. 
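// Illustrative addition: chaining the two saturating packs narrows 32-bit values
// down to unsigned bytes in two steps, a common pixel-conversion pattern.
unittest
{
    __m128i lo = _mm_setr_epi32(-50, 0, 300, 65536);
    __m128i hi = _mm_setr_epi32(1, 2, 3, 255);
    __m128i words = _mm_packs_epi32(lo, hi);                     // 8 x 16-bit, signed saturation
    byte16 bytes  = cast(byte16) _mm_packus_epi16(words, words); // 16 x 8-bit, unsigned saturation
    ubyte[8] correct = [0, 0, 255, 255, 1, 2, 3, 255];
    foreach(i; 0..8)
        assert(cast(ubyte)bytes.array[i] == correct[i]);
}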
2809 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2810 { 2811 static if (GDC_with_SSE2) 2812 { 2813 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2814 } 2815 else static if (LDC_with_SSE2) 2816 { 2817 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2818 } 2819 else static if (LDC_with_ARM64) 2820 { 2821 // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02 2822 byte8 ra = vqmovun_s16(cast(short8)a); 2823 byte8 rb = vqmovun_s16(cast(short8)b); 2824 return cast(__m128i)vcombine_s8(ra, rb); 2825 } 2826 else 2827 { 2828 short8 sa = cast(short8)a; 2829 short8 sb = cast(short8)b; 2830 ubyte[16] result = void; 2831 for (int i = 0; i < 8; ++i) 2832 { 2833 short s = sa[i]; 2834 if (s < 0) s = 0; 2835 if (s > 255) s = 255; 2836 result[i] = cast(ubyte)s; 2837 2838 s = sb[i]; 2839 if (s < 0) s = 0; 2840 if (s > 255) s = 255; 2841 result[i+8] = cast(ubyte)s; 2842 } 2843 return cast(__m128i) loadUnaligned!(byte16)(cast(byte*)result.ptr); 2844 } 2845 } 2846 unittest 2847 { 2848 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 2849 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 2850 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 2851 0, 255, 0, 255, 255, 2, 1, 0]; 2852 foreach(i; 0..16) 2853 assert(AA.array[i] == cast(byte)(correctResult[i])); 2854 } 2855 2856 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 2857 /// and power consumption of spin-wait loops. 2858 void _mm_pause() @trusted 2859 { 2860 version(GNU) 2861 { 2862 static if (GDC_with_SSE2) 2863 { 2864 __builtin_ia32_pause(); 2865 } 2866 else version(X86) 2867 { 2868 asm pure nothrow @nogc @trusted 2869 { 2870 "pause;\n" : : : ; 2871 } 2872 } 2873 else 2874 static assert(false); 2875 } 2876 else static if (LDC_with_SSE2) 2877 { 2878 __builtin_ia32_pause(); 2879 } 2880 else static if (DMD_with_asm) 2881 { 2882 asm nothrow @nogc pure @safe 2883 { 2884 rep; nop; // F3 90 = pause 2885 } 2886 } 2887 else version (LDC) 2888 { 2889 // PERF: Do nothing currently , could be the "yield" intruction on ARM. 2890 } 2891 else 2892 static assert(false); 2893 } 2894 unittest 2895 { 2896 _mm_pause(); 2897 } 2898 2899 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 2900 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 2901 /// low 16 bits of 64-bit elements in result. 
2902 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted 2903 { 2904 static if (GDC_with_SSE2) 2905 { 2906 return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b); 2907 } 2908 else static if (LDC_with_SSE2) 2909 { 2910 return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b); 2911 } 2912 else static if (LDC_with_ARM64) 2913 { 2914 ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b)); 2915 2916 // PERF: Looks suboptimal vs addp 2917 ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]); 2918 ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]); 2919 ushort8 r = 0; 2920 r[0] = r0; 2921 r[4] = r4; 2922 return cast(__m128i) r; 2923 } 2924 else 2925 { 2926 // PERF: ARM32 is lacking 2927 byte16 ab = cast(byte16)a; 2928 byte16 bb = cast(byte16)b; 2929 ubyte[16] t; 2930 foreach(i; 0..16) 2931 { 2932 int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]); 2933 if (diff < 0) diff = -diff; 2934 t[i] = cast(ubyte)(diff); 2935 } 2936 int4 r = _mm_setzero_si128(); 2937 r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 2938 r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; 2939 return r; 2940 } 2941 } 2942 unittest 2943 { 2944 __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 2945 __m128i B = _mm_set1_epi8(1); 2946 __m128i R = _mm_sad_epu8(A, B); 2947 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 2948 0, 2949 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 2950 0]; 2951 assert(R.array == correct); 2952 } 2953 2954 /// Set packed 16-bit integers with the supplied values. 2955 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 2956 { 2957 short[8] result = [e0, e1, e2, e3, e4, e5, e6, e7]; 2958 return cast(__m128i) loadUnaligned!(short8)(result.ptr); 2959 } 2960 unittest 2961 { 2962 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 2963 short8 B = cast(short8) A; 2964 foreach(i; 0..8) 2965 assert(B.array[i] == i); 2966 } 2967 2968 /// Set packed 32-bit integers with the supplied values. 2969 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 2970 { 2971 int[4] result = [e0, e1, e2, e3]; 2972 return loadUnaligned!(int4)(result.ptr); 2973 } 2974 unittest 2975 { 2976 __m128i A = _mm_set_epi32(3, 2, 1, 0); 2977 foreach(i; 0..4) 2978 assert(A.array[i] == i); 2979 } 2980 2981 /// Set packed 64-bit integers with the supplied values. 2982 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted 2983 { 2984 long[2] result = [e0.array[0], e1.array[0]]; 2985 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 2986 } 2987 unittest 2988 { 2989 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); 2990 long2 B = cast(long2) A; 2991 assert(B.array[0] == 5678); 2992 assert(B.array[1] == 1234); 2993 } 2994 2995 /// Set packed 64-bit integers with the supplied values. 2996 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted 2997 { 2998 long[2] result = [e0, e1]; 2999 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 3000 } 3001 unittest 3002 { 3003 __m128i A = _mm_set_epi64x(1234, 5678); 3004 long2 B = cast(long2) A; 3005 assert(B.array[0] == 5678); 3006 assert(B.array[1] == 1234); 3007 } 3008 3009 /// Set packed 8-bit integers with the supplied values. 
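// Illustrative addition: SAD against zero is the idiomatic way to sum 16 unsigned
// bytes; the two 16-bit partial sums land in 32-bit lanes 0 and 2.
unittest
{
    __m128i bytes = _mm_set1_epi8(3);
    int4 sums = cast(int4) _mm_sad_epu8(bytes, _mm_setzero_si128());
    assert(sums.array[0] + sums.array[2] == 16 * 3);
}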
3010 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12, 3011 byte e11, byte e10, byte e9, byte e8, 3012 byte e7, byte e6, byte e5, byte e4, 3013 byte e3, byte e2, byte e1, byte e0) pure @trusted 3014 { 3015 byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7, 3016 e8, e9, e10, e11, e12, e13, e14, e15]; 3017 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 3018 } 3019 3020 /// Set packed double-precision (64-bit) floating-point elements with the supplied values. 3021 __m128d _mm_set_pd (double e1, double e0) pure @trusted 3022 { 3023 double[2] result = [e0, e1]; 3024 return loadUnaligned!(double2)(result.ptr); 3025 } 3026 unittest 3027 { 3028 __m128d A = _mm_set_pd(61.0, 55.0); 3029 double[2] correct = [55.0, 61.0]; 3030 assert(A.array == correct); 3031 } 3032 3033 /// Broadcast double-precision (64-bit) floating-point value `a` to all element. 3034 __m128d _mm_set_pd1 (double a) pure @trusted 3035 { 3036 double[2] result = [a, a]; 3037 return loadUnaligned!(double2)(result.ptr); 3038 } 3039 unittest 3040 { 3041 __m128d A = _mm_set_pd1(61.0); 3042 double[2] correct = [61.0, 61.0]; 3043 assert(A.array == correct); 3044 } 3045 3046 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 3047 /// and zero the upper element. 3048 __m128d _mm_set_sd (double a) pure @trusted 3049 { 3050 double[2] result = [a, 0]; 3051 return loadUnaligned!(double2)(result.ptr); 3052 } 3053 3054 /// Broadcast 16-bit integer a to all elements of dst. 3055 __m128i _mm_set1_epi16 (short a) pure @trusted 3056 { 3057 version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 3058 { 3059 short8 v = a; 3060 return cast(__m128i) v; 3061 } 3062 else 3063 return cast(__m128i)(short8(a)); 3064 } 3065 unittest 3066 { 3067 short8 a = cast(short8) _mm_set1_epi16(31); 3068 for (int i = 0; i < 8; ++i) 3069 assert(a.array[i] == 31); 3070 } 3071 3072 /// Broadcast 32-bit integer `a` to all elements. 3073 __m128i _mm_set1_epi32 (int a) pure @trusted 3074 { 3075 return cast(__m128i)(int4(a)); 3076 } 3077 unittest 3078 { 3079 int4 a = cast(int4) _mm_set1_epi32(31); 3080 for (int i = 0; i < 4; ++i) 3081 assert(a.array[i] == 31); 3082 } 3083 3084 /// Broadcast 64-bit integer `a` to all elements. 3085 __m128i _mm_set1_epi64 (__m64 a) pure @safe 3086 { 3087 return _mm_set_epi64(a, a); 3088 } 3089 unittest 3090 { 3091 long b = 0x1DEADCAFE; 3092 __m64 a; 3093 a.ptr[0] = b; 3094 long2 c = cast(long2) _mm_set1_epi64(a); 3095 assert(c.array[0] == b); 3096 assert(c.array[1] == b); 3097 } 3098 3099 /// Broadcast 64-bit integer `a` to all elements 3100 __m128i _mm_set1_epi64x (long a) pure @trusted 3101 { 3102 long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3103 return cast(__m128i)(b); 3104 } 3105 unittest 3106 { 3107 long b = 0x1DEADCAFE; 3108 long2 c = cast(long2) _mm_set1_epi64x(b); 3109 for (int i = 0; i < 2; ++i) 3110 assert(c.array[i] == b); 3111 } 3112 3113 /// Broadcast 8-bit integer `a` to all elements. 3114 __m128i _mm_set1_epi8 (byte a) pure @trusted 3115 { 3116 byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3117 return cast(__m128i)(b); 3118 } 3119 unittest 3120 { 3121 byte16 b = cast(byte16) _mm_set1_epi8(31); 3122 for (int i = 0; i < 16; ++i) 3123 assert(b.array[i] == 31); 3124 } 3125 3126 alias _mm_set1_pd = _mm_set_pd1; 3127 3128 /// Set packed 16-bit integers with the supplied values in reverse order. 
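// Added checks: _mm_set_epi8 and _mm_set_sd above have no unittests; minimal sketches.
unittest
{
    byte16 A = cast(byte16) _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                          7,  6,  5,  4,  3,  2, 1, 0);
    foreach(i; 0..16)
        assert(A.array[i] == i);

    __m128d B = _mm_set_sd(-2.0);
    double[2] correct = [-2.0, 0.0];
    assert(B.array == correct);
}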
3129 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 3130 short e3, short e2, short e1, short e0) pure @trusted 3131 { 3132 short[8] result = [e7, e6, e5, e4, e3, e2, e1, e0]; 3133 return cast(__m128i)( loadUnaligned!(short8)(result.ptr) ); 3134 } 3135 unittest 3136 { 3137 short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0); 3138 short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0]; 3139 assert(A.array == correct); 3140 } 3141 3142 /// Set packed 32-bit integers with the supplied values in reverse order. 3143 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3144 { 3145 int[4] result = [e3, e2, e1, e0]; 3146 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 3147 } 3148 unittest 3149 { 3150 int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647); 3151 int[4] correct = [-1, 0, -2147483648, 2147483647]; 3152 assert(A.array == correct); 3153 } 3154 3155 /// Set packed 64-bit integers with the supplied values in reverse order. 3156 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted 3157 { 3158 long[2] result = [e1, e0]; 3159 return cast(__m128i)( loadUnaligned!(long2)(result.ptr) ); 3160 } 3161 unittest 3162 { 3163 long2 A = cast(long2) _mm_setr_epi64(-1, 0); 3164 long[2] correct = [-1, 0]; 3165 assert(A.array == correct); 3166 } 3167 3168 /// Set packed 8-bit integers with the supplied values in reverse order. 3169 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12, 3170 byte e11, byte e10, byte e9, byte e8, 3171 byte e7, byte e6, byte e5, byte e4, 3172 byte e3, byte e2, byte e1, byte e0) pure @trusted 3173 { 3174 byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, 3175 e7, e6, e5, e4, e3, e2, e1, e0]; 3176 return cast(__m128i)( loadUnaligned!(byte16)(result.ptr) ); 3177 } 3178 3179 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order. 3180 __m128d _mm_setr_pd (double e1, double e0) pure @trusted 3181 { 3182 double2 result; 3183 result.ptr[0] = e1; 3184 result.ptr[1] = e0; 3185 return result; 3186 } 3187 unittest 3188 { 3189 __m128d A = _mm_setr_pd(61.0, 55.0); 3190 double[2] correct = [61.0, 55.0]; 3191 assert(A.array == correct); 3192 } 3193 3194 /// Return vector of type `__m128d` with all elements set to zero. 3195 __m128d _mm_setzero_pd () pure @trusted 3196 { 3197 // Note: using loadUnaligned has better -O0 codegen compared to .ptr 3198 double[2] result = [0.0, 0.0]; 3199 return loadUnaligned!(double2)(result.ptr); 3200 } 3201 3202 /// Return vector of type `__m128i` with all elements set to zero. 3203 __m128i _mm_setzero_si128() pure @trusted 3204 { 3205 // Note: using loadUnaligned has better -O0 codegen compared to .ptr 3206 int[4] result = [0, 0, 0, 0]; 3207 return cast(__m128i)( loadUnaligned!(int4)(result.ptr) ); 3208 } 3209 3210 /// Shuffle 32-bit integers in a using the control in `imm8`. 3211 /// See_also: `_MM_SHUFFLE`. 
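// Added checks: _mm_setr_epi8 and the two setzero intrinsics above have no unittests.
unittest
{
    byte16 A = cast(byte16) _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                          8, 9, 10, 11, 12, 13, 14, 15);
    foreach(i; 0..16)
        assert(A.array[i] == i);

    __m128d Z = _mm_setzero_pd();
    double[2] zpd = [0.0, 0.0];
    assert(Z.array == zpd);

    int4 ZI = cast(int4) _mm_setzero_si128();
    int[4] zsi = [0, 0, 0, 0];
    assert(ZI.array == zsi);
}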
3212 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe 3213 { 3214 static if (GDC_with_SSE2) 3215 { 3216 return __builtin_ia32_pshufd(a, imm8); 3217 } 3218 else 3219 { 3220 return shufflevector!(int4, (imm8 >> 0) & 3, 3221 (imm8 >> 2) & 3, 3222 (imm8 >> 4) & 3, 3223 (imm8 >> 6) & 3)(a, a); 3224 } 3225 } 3226 unittest 3227 { 3228 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 3229 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3230 int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A); 3231 int[4] expectedB = [ 3, 2, 1, 0 ]; 3232 assert(B.array == expectedB); 3233 } 3234 3235 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`. 3236 /// See_also: `_MM_SHUFFLE2`. 3237 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe 3238 { 3239 static if (GDC_with_SSE2) 3240 { 3241 return __builtin_ia32_shufpd(a, b, imm8); 3242 } 3243 else 3244 { 3245 return shufflevector!(double2, 0 + ( imm8 & 1 ), 3246 2 + ( (imm8 >> 1) & 1 ))(a, b); 3247 } 3248 } 3249 unittest 3250 { 3251 __m128d A = _mm_setr_pd(0.5, 2.0); 3252 __m128d B = _mm_setr_pd(4.0, 5.0); 3253 enum int SHUFFLE = _MM_SHUFFLE2(1, 1); 3254 __m128d R = _mm_shuffle_pd!SHUFFLE(A, B); 3255 double[2] correct = [ 2.0, 5.0 ]; 3256 assert(R.array == correct); 3257 } 3258 3259 /// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 3260 /// 64 bits of result, with the low 64 bits being copied from from `a` to result. 3261 /// See also: `_MM_SHUFFLE`. 3262 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe 3263 { 3264 static if (GDC_with_SSE2) 3265 { 3266 return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8); 3267 } 3268 else 3269 { 3270 return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3, 3271 4 + ( (imm8 >> 0) & 3 ), 3272 4 + ( (imm8 >> 2) & 3 ), 3273 4 + ( (imm8 >> 4) & 3 ), 3274 4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a); 3275 } 3276 } 3277 unittest 3278 { 3279 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3280 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3281 short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A); 3282 short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ]; 3283 assert(C.array == expectedC); 3284 } 3285 3286 /// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 3287 /// bits of result, with the high 64 bits being copied from from `a` to result. 3288 /// See_also: `_MM_SHUFFLE`. 3289 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe 3290 { 3291 static if (GDC_with_SSE2) 3292 { 3293 return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8); 3294 } 3295 else 3296 { 3297 return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ), 3298 ( (imm8 >> 2) & 3 ), 3299 ( (imm8 >> 4) & 3 ), 3300 ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a); 3301 } 3302 } 3303 unittest 3304 { 3305 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3306 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3307 short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A); 3308 short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ]; 3309 assert(B.array == expectedB); 3310 } 3311 3312 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros. 
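// Illustrative addition: the three shuffle intrinsics above compose into a full
// 16-bit lane reversal; a hedged sketch, inputs are arbitrary.
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int REV16  = _MM_SHUFFLE(0, 1, 2, 3); // reverse within each 64-bit half
    enum int SWAP64 = _MM_SHUFFLE(1, 0, 3, 2); // swap the two 64-bit halves
    __m128i r = _mm_shufflelo_epi16!REV16(A);
    r = _mm_shufflehi_epi16!REV16(r);
    r = _mm_shuffle_epi32!SWAP64(r);
    short[8] correct = [7, 6, 5, 4, 3, 2, 1, 0];
    assert((cast(short8)r).array == correct);
}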
deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pslld128(a, count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            pslld XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        int4 r = void;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..4)
            r[i] = cast(uint)(a[i]) << bits;
        return r;
    }
}

/// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros.
deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllq XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        // ARM: good since LDC 1.12 -O2
        // ~but -O0 version is catastrophic
        long2 r = void;
        long2 sa = cast(long2)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        foreach(i; 0..2)
            r.array[i] = cast(ulong)(sa.array[i]) << bits;
        return cast(__m128i)r;
    }
}

/// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros.
deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psllw128(cast(short8)a, cast(short8)count);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc
        {
            movdqu XMM0, a;
            movdqu XMM1, count;
            psllw XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 lc = cast(long2)count;
        int bits = cast(int)(lc.array[0]);
        short8 r = void;
        foreach(i; 0..8)
            r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits);
        return cast(int4)r;
    }
}


/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
__m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE2)
    {
        return __builtin_ia32_pslldi128(a, cast(ubyte)imm8);
    }
    else
    {
        // Note: the intrinsics guarantee imm8[0..7] is taken, however
        // D says "It's illegal to shift by the same or more bits
        // than the size of the quantity being shifted"
        // and it's UB instead.
3433 int4 r = _mm_setzero_si128(); 3434 3435 ubyte count = cast(ubyte) imm8; 3436 if (count > 31) 3437 return r; 3438 3439 foreach(i; 0..4) 3440 r.array[i] = cast(uint)(a.array[i]) << count; 3441 return r; 3442 } 3443 } 3444 unittest 3445 { 3446 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3447 __m128i B = _mm_slli_epi32(A, 1); 3448 __m128i B2 = _mm_slli_epi32(A, 1 + 256); 3449 int[4] expectedB = [ 0, 4, 6, -8]; 3450 assert(B.array == expectedB); 3451 assert(B2.array == expectedB); 3452 3453 __m128i C = _mm_slli_epi32(A, 0); 3454 int[4] expectedC = [ 0, 2, 3, -4]; 3455 assert(C.array == expectedC); 3456 3457 __m128i D = _mm_slli_epi32(A, 65); 3458 int[4] expectedD = [ 0, 0, 0, 0]; 3459 assert(D.array == expectedD); 3460 } 3461 3462 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 3463 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted 3464 { 3465 static if (GDC_with_SSE2) 3466 { 3467 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3468 } 3469 else static if (LDC_with_SSE2) 3470 { 3471 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3472 } 3473 else 3474 { 3475 long2 sa = cast(long2)a; 3476 3477 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3478 // D says "It's illegal to shift by the same or more bits 3479 // than the size of the quantity being shifted" 3480 // and it's UB instead. 3481 long2 r = cast(long2) _mm_setzero_si128(); 3482 ubyte count = cast(ubyte) imm8; 3483 if (count > 63) 3484 return cast(__m128i)r; 3485 3486 r.ptr[0] = cast(ulong)(sa.array[0]) << count; 3487 r.ptr[1] = cast(ulong)(sa.array[1]) << count; 3488 return cast(__m128i)r; 3489 } 3490 } 3491 unittest 3492 { 3493 __m128i A = _mm_setr_epi64(8, -4); 3494 long2 B = cast(long2) _mm_slli_epi64(A, 1); 3495 long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024); 3496 long[2] expectedB = [ 16, -8]; 3497 assert(B.array == expectedB); 3498 assert(B2.array == expectedB); 3499 3500 long2 C = cast(long2) _mm_slli_epi64(A, 0); 3501 long[2] expectedC = [ 8, -4]; 3502 assert(C.array == expectedC); 3503 3504 long2 D = cast(long2) _mm_slli_epi64(A, 64); 3505 long[2] expectedD = [ 0, -0]; 3506 assert(D.array == expectedD); 3507 } 3508 3509 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 
3510 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted 3511 { 3512 static if (GDC_with_SSE2) 3513 { 3514 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3515 } 3516 else static if (LDC_with_SSE2) 3517 { 3518 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3519 } 3520 else static if (LDC_with_ARM64) 3521 { 3522 short8 sa = cast(short8)a; 3523 short8 r = cast(short8)_mm_setzero_si128(); 3524 ubyte count = cast(ubyte) imm8; 3525 if (count > 15) 3526 return cast(__m128i)r; 3527 r = sa << short8(count); 3528 return cast(__m128i)r; 3529 } 3530 else 3531 { 3532 short8 sa = cast(short8)a; 3533 short8 r = cast(short8)_mm_setzero_si128(); 3534 ubyte count = cast(ubyte) imm8; 3535 if (count > 15) 3536 return cast(__m128i)r; 3537 foreach(i; 0..8) 3538 r.ptr[i] = cast(short)(sa.array[i] << count); 3539 return cast(__m128i)r; 3540 } 3541 } 3542 unittest 3543 { 3544 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3545 short8 B = cast(short8)( _mm_slli_epi16(A, 1) ); 3546 short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) ); 3547 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 3548 assert(B.array == expectedB); 3549 assert(B2.array == expectedB); 3550 3551 short8 C = cast(short8)( _mm_slli_epi16(A, 16) ); 3552 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 3553 assert(C.array == expectedC); 3554 } 3555 3556 3557 /// Shift `a` left by `bytes` bytes while shifting in zeros. 3558 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted 3559 { 3560 static if (bytes & 0xF0) 3561 { 3562 return _mm_setzero_si128(); 3563 } 3564 else 3565 { 3566 static if (GDC_with_SSE2) 3567 { 3568 return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 3569 } 3570 else version(DigitalMars) 3571 { 3572 version(D_InlineAsm_X86) 3573 { 3574 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64 3575 { 3576 movdqu XMM0, op; 3577 pslldq XMM0, bytes; 3578 movdqu op, XMM0; 3579 } 3580 return op; 3581 } 3582 else 3583 { 3584 byte16 A = cast(byte16)op; 3585 byte16 R; 3586 for (int n = 15; n >= bytes; --n) 3587 R.ptr[n] = A.array[n-bytes]; 3588 for (int n = bytes-1; n >= 0; --n) 3589 R.ptr[n] = 0; 3590 return cast(__m128i)R; 3591 } 3592 } 3593 else 3594 { 3595 return cast(__m128i) shufflevector!(byte16, 3596 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes, 3597 22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes, 3598 28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes) 3599 (cast(byte16)_mm_setzero_si128(), cast(byte16)op); 3600 } 3601 } 3602 } 3603 unittest 3604 { 3605 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3606 short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left 3607 short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ]; 3608 assert(R.array == correct); 3609 3610 __m128i B = _mm_srli_si128!16(_mm_set1_epi32(-1)); 3611 int[4] expectedB = [0, 0, 0, 0]; 3612 assert(B.array == expectedB); 3613 } 3614 3615 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`. 
3616 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted 3617 { 3618 version(LDC) 3619 { 3620 // Disappeared with LDC 1.11 3621 static if (__VERSION__ < 2081) 3622 return __builtin_ia32_sqrtpd(vec); 3623 else 3624 { 3625 vec.array[0] = llvm_sqrt(vec.array[0]); 3626 vec.array[1] = llvm_sqrt(vec.array[1]); 3627 return vec; 3628 } 3629 } 3630 else static if (GDC_with_SSE2) 3631 { 3632 return __builtin_ia32_sqrtpd(vec); 3633 } 3634 else 3635 { 3636 vec.ptr[0] = sqrt(vec.array[0]); 3637 vec.ptr[1] = sqrt(vec.array[1]); 3638 return vec; 3639 } 3640 } 3641 3642 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 3643 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 3644 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted 3645 { 3646 // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only. 3647 // "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 3648 // The quadword at bits 127:64 of the destination operand remains unchanged." 3649 version(LDC) 3650 { 3651 // Disappeared with LDC 1.11 3652 static if (__VERSION__ < 2081) 3653 { 3654 __m128d c = __builtin_ia32_sqrtsd(b); 3655 a[0] = c[0]; 3656 return a; 3657 } 3658 else 3659 { 3660 a.array[0] = llvm_sqrt(b.array[0]); 3661 return a; 3662 } 3663 } 3664 else static if (GDC_with_SSE2) 3665 { 3666 __m128d c = __builtin_ia32_sqrtsd(b); 3667 a.ptr[0] = c.array[0]; 3668 return a; 3669 } 3670 else 3671 { 3672 a.ptr[0] = sqrt(b.array[0]); 3673 return a; 3674 } 3675 } 3676 unittest 3677 { 3678 __m128d A = _mm_setr_pd(1.0, 3.0); 3679 __m128d B = _mm_setr_pd(4.0, 5.0); 3680 __m128d R = _mm_sqrt_sd(A, B); 3681 double[2] correct = [2.0, 3.0 ]; 3682 assert(R.array == correct); 3683 } 3684 3685 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits. 3686 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted 3687 { 3688 static if (GDC_with_SSE2) 3689 { 3690 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3691 } 3692 else static if (LDC_with_SSE2) 3693 { 3694 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3695 } 3696 else 3697 { 3698 short8 sa = cast(short8)a; 3699 long2 lc = cast(long2)count; 3700 int bits = cast(int)(lc.array[0]); 3701 short8 r = void; 3702 foreach(i; 0..8) 3703 r.ptr[i] = cast(short)(sa.array[i] >> bits); 3704 return cast(int4)r; 3705 } 3706 } 3707 3708 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits. 3709 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted 3710 { 3711 static if (LDC_with_SSE2) 3712 { 3713 return __builtin_ia32_psrad128(a, count); 3714 } 3715 else static if (GDC_with_SSE2) 3716 { 3717 return __builtin_ia32_psrad128(a, count); 3718 } 3719 else 3720 { 3721 int4 r = void; 3722 long2 lc = cast(long2)count; 3723 int bits = cast(int)(lc.array[0]); 3724 r.ptr[0] = (a.array[0] >> bits); 3725 r.ptr[1] = (a.array[1] >> bits); 3726 r.ptr[2] = (a.array[2] >> bits); 3727 r.ptr[3] = (a.array[3] >> bits); 3728 return r; 3729 } 3730 } 3731 3732 3733 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 
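// Added check: _mm_sqrt_pd above has no unittest; a minimal sketch with exact squares.
unittest
{
    __m128d A = _mm_setr_pd(4.0, 16.0);
    __m128d R = _mm_sqrt_pd(A);
    double[2] correct = [2.0, 4.0];
    assert(R.array == correct);
}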
3734 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 3735 { 3736 static if (GDC_with_SSE2) 3737 { 3738 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3739 } 3740 else static if (LDC_with_SSE2) 3741 { 3742 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3743 } 3744 else static if (LDC_with_ARM64) 3745 { 3746 short8 sa = cast(short8)a; 3747 ubyte count = cast(ubyte)imm8; 3748 if (count > 15) 3749 count = 15; 3750 short8 r = sa >> short8(count); 3751 return cast(__m128i)r; 3752 } 3753 else 3754 { 3755 short8 sa = cast(short8)a; 3756 short8 r = void; 3757 3758 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3759 // D says "It's illegal to shift by the same or more bits 3760 // than the size of the quantity being shifted" 3761 // and it's UB instead. 3762 ubyte count = cast(ubyte)imm8; 3763 if (count > 15) 3764 count = 15; 3765 foreach(i; 0..8) 3766 r.ptr[i] = cast(short)(sa.array[i] >> count); 3767 return cast(int4)r; 3768 } 3769 } 3770 unittest 3771 { 3772 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3773 short8 B = cast(short8)( _mm_srai_epi16(A, 1) ); 3774 short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) ); 3775 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 3776 assert(B.array == expectedB); 3777 assert(B2.array == expectedB); 3778 3779 short8 C = cast(short8)( _mm_srai_epi16(A, 18) ); 3780 short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ]; 3781 assert(C.array == expectedC); 3782 } 3783 3784 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 3785 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted 3786 { 3787 static if (LDC_with_SSE2) 3788 { 3789 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 3790 } 3791 else static if (GDC_with_SSE2) 3792 { 3793 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 3794 } 3795 else 3796 { 3797 int4 r = void; 3798 3799 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3800 // D says "It's illegal to shift by the same or more bits 3801 // than the size of the quantity being shifted" 3802 // and it's UB instead. 
3803 ubyte count = cast(ubyte) imm8; 3804 if (count > 31) 3805 count = 31; 3806 3807 r.ptr[0] = (a.array[0] >> count); 3808 r.ptr[1] = (a.array[1] >> count); 3809 r.ptr[2] = (a.array[2] >> count); 3810 r.ptr[3] = (a.array[3] >> count); 3811 return r; 3812 } 3813 } 3814 unittest 3815 { 3816 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3817 __m128i B = _mm_srai_epi32(A, 1); 3818 __m128i B2 = _mm_srai_epi32(A, 1 + 256); 3819 int[4] expectedB = [ 0, 1, 1, -2]; 3820 assert(B.array == expectedB); 3821 assert(B2.array == expectedB); 3822 3823 __m128i C = _mm_srai_epi32(A, 32); 3824 int[4] expectedC = [ 0, 0, 0, -1]; 3825 assert(C.array == expectedC); 3826 3827 __m128i D = _mm_srai_epi32(A, 0); 3828 int[4] expectedD = [ 0, 2, 3, -4]; 3829 assert(D.array == expectedD); 3830 } 3831 3832 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted 3833 { 3834 static if (LDC_with_SSE2) 3835 { 3836 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 3837 } 3838 else static if (GDC_with_SSE2) 3839 { 3840 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 3841 } 3842 else 3843 { 3844 short8 sa = cast(short8)a; 3845 long2 lc = cast(long2)count; 3846 int bits = cast(int)(lc.array[0]); 3847 short8 r = void; 3848 foreach(i; 0..8) 3849 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits); 3850 return cast(int4)r; 3851 } 3852 } 3853 3854 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted 3855 { 3856 static if (LDC_with_SSE2) 3857 { 3858 return __builtin_ia32_psrld128(a, count); 3859 } 3860 else static if (GDC_with_SSE2) 3861 { 3862 return __builtin_ia32_psrld128(a, count); 3863 } 3864 else 3865 { 3866 int4 r = void; 3867 long2 lc = cast(long2)count; 3868 int bits = cast(int)(lc.array[0]); 3869 r.ptr[0] = cast(uint)(a.array[0]) >> bits; 3870 r.ptr[1] = cast(uint)(a.array[1]) >> bits; 3871 r.ptr[2] = cast(uint)(a.array[2]) >> bits; 3872 r.ptr[3] = cast(uint)(a.array[3]) >> bits; 3873 return r; 3874 } 3875 } 3876 3877 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted 3878 { 3879 static if (LDC_with_SSE2) 3880 { 3881 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 3882 } 3883 else static if (GDC_with_SSE2) 3884 { 3885 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 3886 } 3887 else 3888 { 3889 long2 r = void; 3890 long2 sa = cast(long2)a; 3891 long2 lc = cast(long2)count; 3892 int bits = cast(int)(lc.array[0]); 3893 r.ptr[0] = cast(ulong)(sa.array[0]) >> bits; 3894 r.ptr[1] = cast(ulong)(sa.array[1]) >> bits; 3895 return cast(__m128i)r; 3896 } 3897 } 3898 3899 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 3900 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted 3901 { 3902 static if (GDC_with_SSE2) 3903 { 3904 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 3905 } 3906 else static if (LDC_with_SSE2) 3907 { 3908 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 3909 } 3910 else static if (LDC_with_ARM64) 3911 { 3912 short8 sa = cast(short8)a; 3913 short8 r = cast(short8) _mm_setzero_si128(); 3914 3915 ubyte count = cast(ubyte)imm8; 3916 if (count >= 16) 3917 return cast(__m128i)r; 3918 3919 r = sa >>> short8(count); // This facility offered with LDC, but not DMD. 
3920 return cast(__m128i)r; 3921 } 3922 else 3923 { 3924 short8 sa = cast(short8)a; 3925 ubyte count = cast(ubyte)imm8; 3926 3927 short8 r = cast(short8) _mm_setzero_si128(); 3928 if (count >= 16) 3929 return cast(__m128i)r; 3930 3931 foreach(i; 0..8) 3932 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count); 3933 return cast(__m128i)r; 3934 } 3935 } 3936 unittest 3937 { 3938 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3939 short8 B = cast(short8)( _mm_srli_epi16(A, 1) ); 3940 short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) ); 3941 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 3942 assert(B.array == expectedB); 3943 assert(B2.array == expectedB); 3944 3945 short8 C = cast(short8)( _mm_srli_epi16(A, 16) ); 3946 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0]; 3947 assert(C.array == expectedC); 3948 3949 short8 D = cast(short8)( _mm_srli_epi16(A, 0) ); 3950 short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ]; 3951 assert(D.array == expectedD); 3952 } 3953 3954 3955 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 3956 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted 3957 { 3958 static if (GDC_with_SSE2) 3959 { 3960 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 3961 } 3962 else static if (LDC_with_SSE2) 3963 { 3964 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 3965 } 3966 else 3967 { 3968 ubyte count = cast(ubyte) imm8; 3969 3970 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3971 // D says "It's illegal to shift by the same or more bits 3972 // than the size of the quantity being shifted" 3973 // and it's UB instead. 3974 int4 r = _mm_setzero_si128(); 3975 if (count >= 32) 3976 return r; 3977 r.ptr[0] = a.array[0] >>> count; 3978 r.ptr[1] = a.array[1] >>> count; 3979 r.ptr[2] = a.array[2] >>> count; 3980 r.ptr[3] = a.array[3] >>> count; 3981 return r; 3982 } 3983 } 3984 unittest 3985 { 3986 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3987 __m128i B = _mm_srli_epi32(A, 1); 3988 __m128i B2 = _mm_srli_epi32(A, 1 + 256); 3989 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; 3990 assert(B.array == expectedB); 3991 assert(B2.array == expectedB); 3992 3993 __m128i C = _mm_srli_epi32(A, 255); 3994 int[4] expectedC = [ 0, 0, 0, 0 ]; 3995 assert(C.array == expectedC); 3996 } 3997 3998 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 3999 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted 4000 { 4001 static if (GDC_with_SSE2) 4002 { 4003 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4004 } 4005 else static if (LDC_with_SSE2) 4006 { 4007 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4008 } 4009 else 4010 { 4011 long2 r = cast(long2) _mm_setzero_si128(); 4012 long2 sa = cast(long2)a; 4013 4014 ubyte count = cast(ubyte) imm8; 4015 if (count >= 64) 4016 return cast(__m128i)r; 4017 4018 r.ptr[0] = sa.array[0] >>> count; 4019 r.ptr[1] = sa.array[1] >>> count; 4020 return cast(__m128i)r; 4021 } 4022 } 4023 unittest 4024 { 4025 __m128i A = _mm_setr_epi64(8, -4); 4026 long2 B = cast(long2) _mm_srli_epi64(A, 1); 4027 long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512); 4028 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; 4029 assert(B.array == expectedB); 4030 assert(B2.array == expectedB); 4031 4032 long2 C = cast(long2) _mm_srli_epi64(A, 64); 4033 long[2] expectedC = [ 0, 0 ]; 4034 assert(C.array == expectedC); 4035 } 4036 4037 /// Shift `v` right by `bytes` bytes while shifting in zeros. 
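/// `bytes` is a compile-time argument; values of 16 or more yield a zero vector.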
__m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @safe
{
    static if (bytes & 0xF0)
    {
        return _mm_setzero_si128();
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8));
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, v;
            psrldq XMM0, bytes;
            movdqu v, XMM0;
        }
        return v;
    }
    else
    {
        return cast(__m128i) shufflevector!(byte16,
            bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7,
            bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15)
            (cast(byte16) v, cast(byte16)_mm_setzero_si128());
    }
}
unittest
{
    __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1));
    int[4] correct = [2, 3, 4, 0];
    assert(R.array == correct);

    __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1));
    int[4] expectedA = [0, 0, 0, 0];
    assert(A.array == expectedA);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe
{
    return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v);
}
unittest
{
    __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    float[4] correct = [3.0f, 4.0f, 0, 0];
    assert(R.array == correct);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
/// #BONUS
__m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe
{
    return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v);
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    *aligned = a;
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted
{
    __m128d* aligned = cast(__m128d*)mem_addr;
    __m128d r;
    r.ptr[0] = a.array[0];
    r.ptr[1] = a.array[0];
    *aligned = r;
}

/// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to
/// be aligned on any particular boundary.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[0];
}

/// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a
/// general-protection exception may be generated.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    *mem_addr = a;
}

alias _mm_store1_pd = _mm_store_pd1; ///

/// Store the upper double-precision (64-bit) floating-point element from `a` into memory.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    *mem_addr = a.array[1];
}

// Note: `mem_addr` doesn't have to actually be aligned, which breaks
// expectations from the user point of view. This problem also exists in C++.
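/// Store 64-bit integer from the first element of `a` into memory.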
4140 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe 4141 { 4142 long* dest = cast(long*)mem_addr; 4143 long2 la = cast(long2)a; 4144 *dest = la.array[0]; 4145 } 4146 unittest 4147 { 4148 long[3] A = [1, 2, 3]; 4149 _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000)); 4150 long[3] correct = [1, 0x1_0000_0000, 3]; 4151 assert(A == correct); 4152 } 4153 4154 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. 4155 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe 4156 { 4157 *mem_addr = a.array[0]; 4158 } 4159 4160 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order. `mem_addr` must be 4161 /// aligned on a 16-byte boundary or a general-protection exception may be generated. 4162 void _mm_storer_pd (double* mem_addr, __m128d a) pure 4163 { 4164 __m128d* aligned = cast(__m128d*)mem_addr; 4165 *aligned = shufflevector!(double2, 1, 0)(a, a); 4166 } 4167 4168 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 4169 /// `mem_addr` does not need to be aligned on any particular boundary. 4170 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe 4171 { 4172 storeUnaligned!double2(a, mem_addr); 4173 } 4174 4175 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 4176 /// boundary. 4177 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe 4178 { 4179 storeUnaligned!__m128i(a, cast(int*)mem_addr); 4180 } 4181 4182 /// Store 32-bit integer from the first element of `a` into memory. 4183 /// `mem_addr` does not need to be aligned on any particular boundary. 4184 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted 4185 { 4186 int* dest = cast(int*)mem_addr; 4187 *dest = a.array[0]; 4188 } 4189 unittest 4190 { 4191 int[2] arr = [-24, 12]; 4192 _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7)); 4193 assert(arr == [-24, -1]); 4194 } 4195 4196 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) 4197 /// from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte 4198 /// boundary or a general-protection exception may be generated. 4199 void _mm_stream_pd (double* mem_addr, __m128d a) 4200 { 4201 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4202 __m128d* dest = cast(__m128d*)mem_addr; 4203 *dest = a; 4204 } 4205 4206 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint. 4207 /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception 4208 /// may be generated. 4209 void _mm_stream_si128 (__m128i* mem_addr, __m128i a) 4210 { 4211 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4212 __m128i* dest = cast(__m128i*)mem_addr; 4213 *dest = a; 4214 } 4215 4216 /// Store 32-bit integer a into memory using a non-temporal hint to minimize cache 4217 /// pollution. If the cache line containing address mem_addr is already in the cache, 4218 /// the cache will be updated. 4219 void _mm_stream_si32 (int* mem_addr, int a) 4220 { 4221 // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves 4222 *mem_addr = a; 4223 } 4224 4225 /// Store 64-bit integer a into memory using a non-temporal hint to minimize 4226 /// cache pollution. 
If the cache line containing address mem_addr is already
/// in the cache, the cache will be updated.
void _mm_stream_si64 (long* mem_addr, long a)
{
    // BUG see `_mm_stream_ps` for an explanation why we don't implement non-temporal moves
    *mem_addr = a;
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit)
/// floating-point elements in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    return a - b;
}

/// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit)
/// floating-point element in `a`, store the difference in the lower element of the result, and copy the upper element
/// from `a` to the upper element of the result.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_subsd(a, b);
    }
    else
    {
        a.ptr[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}

/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    return a - b;
}

/// Subtract packed 16-bit signed integers in `b` from packed 16-bit signed integers in `a` using signed saturation.
__m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBSW since LDC 1.15 -O0
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed 8-bit signed integers in `b` from packed 8-bit signed integers in `a` using signed saturation.
__m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBSB since LDC 1.15 -O0
            // ARM: Generates sqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            byte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        byte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed 16-bit unsigned integers in `b` from packed 16-bit unsigned integers in `a` using unsigned saturation.
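/// (Differences that would go below zero saturate to 0, as the unittest below shows.)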
__m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSW since LDC 1.15 -O0
            // ARM: Generates uqsub.8h since LDC 1.21 -O0
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(sum);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        short[8] res;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            res[i] = saturateSignedIntToUnsignedShort(sum);
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}

/// Subtract packed 8-bit unsigned integers in `b` from packed 8-bit unsigned integers in `a` using unsigned saturation.
__m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSB since LDC 1.15 -O0
            // ARM: Generates uqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ubyte[16] res;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubusb128(cast(byte16)a, cast(byte16)b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        ubyte[16] res;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Note: the ucomi and comi comparisons differ only in their signalling behaviour
// on quiet NaNs, so aliasing them is slightly incorrect. However, a case where
// you would want to tell qNaN and sNaN apart and then treat them differently
// on purpose seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd; ///
alias _mm_ucomige_sd = _mm_comige_sd; ///
alias _mm_ucomigt_sd = _mm_comigt_sd; ///
alias _mm_ucomile_sd = _mm_comile_sd; ///
alias _mm_ucomilt_sd = _mm_comilt_sd; ///
alias _mm_ucomineq_sd = _mm_comineq_sd; ///

/// Return vector of type `__m128d` with undefined elements.
__m128d _mm_undefined_pd() pure @safe
{
    __m128d result = void;
    return result;
}

/// Return vector of type `__m128i` with undefined elements.
__m128i _mm_undefined_si128() pure @safe
{
    __m128i result = void;
    return result;
}

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b);
    }
    else static if (DMD_with_32bit_asm)
    {
        asm pure nothrow @nogc @trusted
        {
            movdqu XMM0, a;
            movdqu XMM1, b;
            punpckhwd XMM0, XMM1;
            movdqu a, XMM0;
        }
        return a;
    }
    else
    {
        return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)
                                           (cast(short8)a, cast(short8)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11);
    __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19);
    short8 C = cast(short8)(_mm_unpackhi_epi16(A, B));
    short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19];
    assert(C.array == correct);
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
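/// Resulting order: `[a2, b2, a3, b3]`.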
4584 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted 4585 { 4586 static if (GDC_with_SSE2) 4587 { 4588 return __builtin_ia32_punpckhdq128(a, b); 4589 } 4590 else version(DigitalMars) 4591 { 4592 __m128i r; 4593 r.ptr[0] = a.array[2]; 4594 r.ptr[1] = b.array[2]; 4595 r.ptr[2] = a.array[3]; 4596 r.ptr[3] = b.array[3]; 4597 return r; 4598 } 4599 else 4600 { 4601 return shufflevector!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b); 4602 } 4603 } 4604 unittest 4605 { 4606 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4607 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 4608 __m128i C = _mm_unpackhi_epi32(A, B); 4609 int[4] correct = [3, 7, 4, 8]; 4610 assert(C.array == correct); 4611 } 4612 4613 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. 4614 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted 4615 { 4616 static if (GDC_with_SSE2) 4617 { 4618 return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b); 4619 } 4620 else 4621 { 4622 __m128i r = cast(__m128i)b; 4623 r[0] = a[2]; 4624 r[1] = a[3]; 4625 return r; 4626 } 4627 } 4628 unittest // Issue #36 4629 { 4630 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4631 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4632 long2 C = cast(long2)(_mm_unpackhi_epi64(A, B)); 4633 long[2] correct = [0x33333333_33333333, 0x55555555_55555555]; 4634 assert(C.array == correct); 4635 } 4636 4637 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. 4638 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe 4639 { 4640 static if (GDC_with_SSE2) 4641 { 4642 return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b); 4643 } 4644 else static if (DMD_with_32bit_asm) 4645 { 4646 asm pure nothrow @nogc @trusted 4647 { 4648 movdqu XMM0, a; 4649 movdqu XMM1, b; 4650 punpckhbw XMM0, XMM1; 4651 movdqu a, XMM0; 4652 } 4653 return a; 4654 } 4655 else 4656 { 4657 return cast(__m128i)shufflevector!(byte16, 8, 24, 9, 25, 10, 26, 11, 27, 4658 12, 28, 13, 29, 14, 30, 15, 31) 4659 (cast(byte16)a, cast(byte16)b); 4660 } 4661 } 4662 unittest 4663 { 4664 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 4665 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 4666 byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B); 4667 byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]; 4668 assert(C.array == correct); 4669 } 4670 4671 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`. 4672 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe 4673 { 4674 static if (GDC_with_SSE2) 4675 { 4676 return __builtin_ia32_unpckhpd(a, b); 4677 } 4678 else 4679 { 4680 return shufflevector!(__m128d, 1, 3)(a, b); 4681 } 4682 } 4683 unittest 4684 { 4685 __m128d A = _mm_setr_pd(4.0, 6.0); 4686 __m128d B = _mm_setr_pd(7.0, 9.0); 4687 __m128d C = _mm_unpackhi_pd(A, B); 4688 double[2] correct = [6.0, 9.0]; 4689 assert(C.array == correct); 4690 } 4691 4692 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. 
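/// Resulting order: `[a0, b0, a1, b1, a2, b2, a3, b3]`.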
4693 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe 4694 { 4695 static if (GDC_with_SSE2) 4696 { 4697 return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b); 4698 } 4699 else static if (DMD_with_32bit_asm) 4700 { 4701 asm pure nothrow @nogc @trusted 4702 { 4703 movdqu XMM0, a; 4704 movdqu XMM1, b; 4705 punpcklwd XMM0, XMM1; 4706 movdqu a, XMM0; 4707 } 4708 return a; 4709 } 4710 else 4711 { 4712 return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11) 4713 (cast(short8)a, cast(short8)b); 4714 } 4715 } 4716 unittest 4717 { 4718 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 4719 __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); 4720 short8 C = cast(short8) _mm_unpacklo_epi16(A, B); 4721 short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11]; 4722 assert(C.array == correct); 4723 } 4724 4725 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. 4726 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted 4727 { 4728 static if (GDC_with_SSE2) 4729 { 4730 return __builtin_ia32_punpckldq128(a, b); 4731 } 4732 else version(DigitalMars) 4733 { 4734 __m128i r; 4735 r.ptr[0] = a.array[0]; 4736 r.ptr[1] = b.array[0]; 4737 r.ptr[2] = a.array[1]; 4738 r.ptr[3] = b.array[1]; 4739 return r; 4740 } 4741 else 4742 { 4743 return shufflevector!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b); 4744 } 4745 } 4746 unittest 4747 { 4748 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4749 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 4750 __m128i C = _mm_unpacklo_epi32(A, B); 4751 int[4] correct = [1, 5, 2, 6]; 4752 assert(C.array == correct); 4753 } 4754 4755 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. 4756 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted 4757 { 4758 static if (GDC_with_SSE2) 4759 { 4760 return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b); 4761 } 4762 else 4763 { 4764 long2 lA = cast(long2)a; 4765 long2 lB = cast(long2)b; 4766 long2 R; 4767 R.ptr[0] = lA.array[0]; 4768 R.ptr[1] = lB.array[0]; 4769 return cast(__m128i)R; 4770 } 4771 } 4772 unittest // Issue #36 4773 { 4774 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 4775 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 4776 long2 C = cast(long2)(_mm_unpacklo_epi64(A, B)); 4777 long[2] correct = [0x22222222_22222222, 0x44444444_44444444]; 4778 assert(C.array == correct); 4779 } 4780 4781 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. 
4782 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe 4783 { 4784 static if (GDC_with_SSE2) 4785 { 4786 return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b); 4787 } 4788 else static if (DMD_with_32bit_asm) 4789 { 4790 asm pure nothrow @nogc @trusted 4791 { 4792 movdqu XMM0, a; 4793 movdqu XMM1, b; 4794 punpcklbw XMM0, XMM1; 4795 movdqu a, XMM0; 4796 } 4797 return a; 4798 } 4799 else 4800 { 4801 return cast(__m128i) shufflevector!(byte16, 0, 16, 1, 17, 2, 18, 3, 19, 4802 4, 20, 5, 21, 6, 22, 7, 23) 4803 (cast(byte16)a, cast(byte16)b); 4804 } 4805 } 4806 unittest 4807 { 4808 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 4809 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 4810 byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B); 4811 byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]; 4812 assert(C.array == correct); 4813 } 4814 4815 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`. 4816 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe 4817 { 4818 static if (GDC_with_SSE2) 4819 { 4820 return __builtin_ia32_unpcklpd(a, b); 4821 } 4822 else 4823 { 4824 return shufflevector!(__m128d, 0, 2)(a, b); 4825 } 4826 } 4827 unittest 4828 { 4829 __m128d A = _mm_setr_pd(4.0, 6.0); 4830 __m128d B = _mm_setr_pd(7.0, 9.0); 4831 __m128d C = _mm_unpacklo_pd(A, B); 4832 double[2] correct = [4.0, 7.0]; 4833 assert(C.array == correct); 4834 } 4835 4836 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 4837 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe 4838 { 4839 return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b); 4840 } 4841 4842 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`. 4843 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe 4844 { 4845 return a ^ b; 4846 } 4847 4848 unittest 4849 { 4850 float distance(float[4] a, float[4] b) nothrow @nogc 4851 { 4852 __m128 va = _mm_loadu_ps(a.ptr); 4853 __m128 vb = _mm_loadu_ps(b.ptr); 4854 __m128 diffSquared = _mm_sub_ps(va, vb); 4855 diffSquared = _mm_mul_ps(diffSquared, diffSquared); 4856 __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared)); 4857 sum = _mm_add_ps(sum, _mm_srli_ps!4(sum)); 4858 return _mm_cvtss_f32(_mm_sqrt_ss(sum)); 4859 } 4860 assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2); 4861 }
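// Illustrative usage sketch (not part of the test-suite above): chains the integer
// subtract, logical shift, and unaligned store intrinsics defined in this module.
// The input values are arbitrary assumptions, chosen only so the expected lanes
// are easy to verify by hand.
unittest
{
    __m128i a = _mm_setr_epi32(10, 20, 30, 40);
    __m128i b = _mm_setr_epi32( 1,  2,  3,  4);
    __m128i diff = _mm_sub_epi32(a, b);       // [9, 18, 27, 36]
    __m128i halved = _mm_srli_epi32(diff, 1); // logical shift right: [4, 9, 13, 18]
    int[4] result;
    _mm_storeu_si128(cast(__m128i*) result.ptr, halved);
    int[4] expected = [4, 9, 13, 18];
    assert(result == expected);
}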