/**
* SSE2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE2
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.emmintrin;

public import inteli.types;
public import inteli.xmmintrin; // SSE2 includes SSE1
import inteli.mmx;
import inteli.internals;

nothrow @nogc:


// SSE2 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2

/// Add packed 16-bit integers in `a` and `b`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(short8)a + cast(short8)b);
}
unittest
{
    __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    short8 R = cast(short8) _mm_add_epi16(A, A);
    short[8] correct = [8, 16, 26, -14, -2, 0, 18, 154];
    assert(R.array == correct);
}

/// Add packed 32-bit integers in `a` and `b`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(int4)a + cast(int4)b);
}
unittest
{
    __m128i A = _mm_setr_epi32( -7, -1, 0, 9);
    int4 R = _mm_add_epi32(A, A);
    int[4] correct = [ -14, -2, 0, 18 ];
    assert(R.array == correct);
}

/// Add packed 64-bit integers in `a` and `b`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(long2)a + cast(long2)b);
}
unittest
{
    __m128i A = _mm_setr_epi64(-1, 0x8000_0000_0000_0000);
    long2 R = cast(long2) _mm_add_epi64(A, A);
    long[2] correct = [ -2, 0 ];
    assert(R.array == correct);
}

/// Add packed 8-bit integers in `a` and `b`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(byte16)a + cast(byte16)b);
}
unittest
{
    __m128i A = _mm_setr_epi8(4, 8, 13, -7, -1, 0, 9, 77, 4, 8, 13, -7, -1, 0, 9, 78);
    byte16 R = cast(byte16) _mm_add_epi8(A, A);
    byte[16] correct = [8, 16, 26, -14, -2, 0, 18, -102, 8, 16, 26, -14, -2, 0, 18, -100];
    assert(R.array == correct);
}

/// Add the lower double-precision (64-bit) floating-point element
/// in `a` and `b`, store the result in the lower element of dst,
/// and copy the upper element from `a` to the upper element of dst.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_addsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] + b[0];
        return a;
    }
    else
    {
        a[0] += b[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_sd(a, a);
    assert(a.array == [3.0, -2.0]);
}

/// Add packed double-precision (64-bit) floating-point elements in `a` and `b`.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a + b;
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_add_pd(a, a);
    assert(a.array == [3.0, -4.0]);
}

/// Add 64-bit integers `a` and `b`.
__m64 _mm_add_si64 (__m64 a, __m64 b) pure @safe
{
    pragma(inline, true);
    return a + b;
}
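// Example added for illustration (not part of the original module); it assumes the
// MMX helper `_mm_cvtsi64_m64` from inteli.mmx (imported above) to build the operands.
unittest
{
    __m64 A = _mm_cvtsi64_m64(-1);
    __m64 B = _mm_cvtsi64_m64(100);
    __m64 R = _mm_add_si64(A, B);
    assert(R.array[0] == 99);
}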
/// Add packed 16-bit integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i)__builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085)  // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSW since LDC 1.15 -O0
            // ARM: Generates sqadd.8h since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res; // PERF =void;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return cast(__m128i) __builtin_ia32_paddsw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToSignedShort(sa.array[i] + sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
                                             _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed 8-bit signed integers in `a` and `b` using signed saturation.
__m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_paddsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        static if (__VERSION__ >= 2085)  // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDSB since LDC 1.15 -O0
            // ARM: Generates sqadd.16b since LDC 1.21 -O1, really bad in <= 1.20
            enum prefix = `declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.sadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            byte[16] res; // PERF =void;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return cast(__m128i) __builtin_ia32_paddsb128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa[i] + sb[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_adds_epi8(_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, 2, 4, 6, 8, 10, 12, 14,
                                               16, 18, 20, 22, 24, 26, 28, 30];
    assert(res.array == correctResult);
}

/// Add packed 8-bit unsigned integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085)  // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSB since LDC 1.15 -O0
            // ARM: Generates uqadd.16b since LDC 1.21 -O1
            enum prefix = `declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.uadd.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ubyte[16] res; // PERF =void;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusb128(a, b);
    }
    else
    {
        ubyte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) + cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16)
        _mm_adds_epu8(_mm_set_epi8(7, 6, 5, 4, 3, 2, cast(byte)255, 0, 7, 6, 5, 4, 3, 2, cast(byte)255, 0),
                      _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [0, cast(byte)255, 4, 6, 8, 10, 12, 14,
                                               0, cast(byte)255, 4, 6, 8, 10, 12, 14];
    assert(res.array == correctResult);
}

/// Add packed unsigned 16-bit integers in `a` and `b` using unsigned saturation.
// PERF: #GDC version?
__m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085)  // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PADDUSW since LDC 1.15 -O0
            // ARM: Generates uqadd.8h since LDC 1.21 -O1
            enum prefix = `declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.uadd.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            ushort[8] res; // PERF =void;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else
            return __builtin_ia32_paddusw128(a, b);
    }
    else
    {
        ushort[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res[i] = saturateSignedIntToUnsignedShort(cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_adds_epu16(_mm_set_epi16(3, 2, cast(short)65535, 0, 3, 2, cast(short)65535, 0),
                                             _mm_set_epi16(3, 2, 1, 0, 3, 2, 1, 0));
    static immutable short[8] correctResult = [0, cast(short)65535, 4, 6, 0, cast(short)65535, 4, 6];
    assert(res.array == correctResult);
}

/// Compute the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return cast(__m128d)( cast(long2)a & cast(long2)b );
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct = (*cast(long*)(&a)) & (*cast(long*)(&b));
    __m128d A = _mm_set_pd(a, b);
    __m128d B = _mm_set_pd(b, a);
    long2 R = cast(long2)( _mm_and_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return a & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_and_si128(A, B);
    int[4] correct = [6, 6, 6, 6];
    assert(R.array == correct);
}

/// Compute the bitwise NOT of packed double-precision (64-bit)
/// floating-point elements in `a` and then AND with `b`.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    return cast(__m128d)( ~(cast(long2)a) & cast(long2)b);
}
unittest
{
    double a = 4.32;
    double b = -78.99;
    long correct  = (~*cast(long*)(&a)) & ( *cast(long*)(&b));
    long correct2 = ( *cast(long*)(&a)) & (~*cast(long*)(&b));
    __m128d A = _mm_setr_pd(a, b);
    __m128d B = _mm_setr_pd(b, a);
    long2 R = cast(long2)( _mm_andnot_pd(A, B) );
    assert(R.array[0] == correct);
    assert(R.array[1] == correct2);
}

/// Compute the bitwise NOT of 128 bits (representing integer data)
/// in `a` and then AND with `b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    return (~a) & b;
}
unittest
{
    __m128i A = _mm_set1_epi32(7);
    __m128i B = _mm_set1_epi32(14);
    __m128i R = _mm_andnot_si128(A, B);
    int[4] correct = [8, 8, 8, 8];
    assert(R.array == correct);
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
__m128i _mm_avg_epu16 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u16(cast(short8)a, cast(short8)b);
    }
    else version(LDC)
    {
        // Generates pavgw even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <8 x i16> %0 to <8 x i32>
            %ib = zext <8 x i16> %1 to <8 x i32>
            %isum = add <8 x i32> %ia, %ib
            %isum1 = add <8 x i32> %isum, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %isums = lshr <8 x i32> %isum1, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
            %r = trunc <8 x i32> %isums to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, short8, short8)(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 sr = void;
        foreach(i; 0..8)
        {
            sr.ptr[i] = cast(ushort)( (cast(ushort)(sa.array[i]) + cast(ushort)(sb.array[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi16(31);
    __m128i B = _mm_set1_epi16(64);
    short8 avg = cast(short8)(_mm_avg_epu16(A, B));
    foreach(i; 0..8)
        assert(avg.array[i] == 48);
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
__m128i _mm_avg_epu8 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pavgb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vrhadd_u8(cast(byte16)a, cast(byte16)b);
    }
    else version(LDC)
    {
        // Generates pavgb even in LDC 1.0, even in -O0
        // But not in ARM
        enum ir = `
            %ia = zext <16 x i8> %0 to <16 x i16>
            %ib = zext <16 x i8> %1 to <16 x i16>
            %isum = add <16 x i16> %ia, %ib
            %isum1 = add <16 x i16> %isum, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %isums = lshr <16 x i16> %isum1, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
            %r = trunc <16 x i16> %isums to <16 x i8>
            ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        byte16 sr = void;
        foreach(i; 0..16)
        {
            sr[i] = cast(ubyte)( (cast(ubyte)(sa[i]) + cast(ubyte)(sb[i]) + 1) >> 1 );
        }
        return cast(int4)sr;
    }
}
unittest
{
    __m128i A = _mm_set1_epi8(31);
    __m128i B = _mm_set1_epi8(64);
    byte16 avg = cast(byte16)(_mm_avg_epu8(A, B));
    foreach(i; 0..16)
        assert(avg.array[i] == 48);
}

/// Shift `a` left by `bytes` bytes while shifting in zeros.
alias _mm_bslli_si128 = _mm_slli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    __m128i result = _mm_bslli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Shift `v` right by `bytes` bytes while shifting in zeros.
alias _mm_bsrli_si128 = _mm_srli_si128;
unittest
{
    __m128i toShift = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte[16] exact =               [5, 6, 7, 8, 9,10,11,12,13,14, 15, 0, 0, 0, 0, 0];
    __m128i result = _mm_bsrli_si128!5(toShift);
    assert( (cast(byte16)result).array == exact);
}

/// Cast vector of type `__m128d` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    return cast(__m128)a;
}

/// Cast vector of type `__m128d` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128` to type `__m128i`.
/// Note: Also possible with a regular `cast(__m128i)(a)`.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    return cast(__m128i)a;
}

/// Cast vector of type `__m128i` to type `__m128d`.
/// Note: Also possible with a regular `cast(__m128d)(a)`.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    return cast(__m128d)a;
}

/// Cast vector of type `__m128i` to type `__m128`.
/// Note: Also possible with a regular `cast(__m128)(a)`.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    return cast(__m128)a;
}

/// Invalidate and flush the cache line that contains `p`
/// from all levels of the cache hierarchy.
void _mm_clflush (const(void)* p) @trusted
{
    static if (GDC_with_SSE2)
    {
        __builtin_ia32_clflush(p);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_clflush(cast(void*)p);
    }
    else version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc @safe
        {
            mov EAX, p;
            clflush [EAX];
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc @safe
        {
            mov RAX, p;
            clflush [RAX];
        }
    }
    else
    {
        // Do nothing. Invalidating cacheline does
        // not affect correctness.
    }
}
unittest
{
    ubyte[64] cacheline;
    _mm_clflush(cacheline.ptr);
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) equalMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0, -1,  0,  0,  0];
    short8   R = cast(short8)(_mm_cmpeq_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpeqd128(a, b);
    }
    else
    {
        return equalMask!__m128i(a, b);
    }
}
unittest
{
    int4   A = [-3, -2, -1, 0];
    int4   B = [ 4, -2,  2, 0];
    int[4] E = [ 0, -1,  0, -1];
    int4   R = cast(int4)(_mm_cmpeq_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpeqb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else
    {
        return cast(__m128i) equalMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpeq_epi8(A, B);
    byte[16] correct = [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality.
__m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oeq)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for equality, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpeqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oeq)(a, b);
    }
}
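// Example added for illustration (not part of the original module): the lower lane
// holds the comparison mask, the upper lane is copied from `a`.
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 3.0);
    __m128d R = _mm_cmpeq_sd(A, B);
    assert( (cast(long2)R).array[0] == -1 ); // 1.0 == 1.0
    assert( R.array[1] == 2.0 );             // upper element comes from `a`
}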
/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal.
__m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.oge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.oge)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        return cast(__m128i) greaterMask!short8(cast(short8)a, cast(short8)b);
    }
}
unittest
{
    short8   A = [-3, -2, -1,  0,  0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1,  0, -1, -2, -3];
    short[8] E = [ 0,  0,  0,  0,  0, -1, -1, -1];
    short8   R = cast(short8)(_mm_cmpgt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_pcmpgtd128(a, b);
    }
    else
    {
        return cast(__m128i)( greaterMask!int4(a, b));
    }
}
unittest
{
    int4   A = [-3,  2, -1, 0];
    int4   B = [ 4, -2,  2, 0];
    int[4] E = [ 0, -1,  0, 0];
    int4   R = cast(int4)(_mm_cmpgt_epi32(A, B));
    assert(R.array == E);
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
{
    // Workaround for a GCC bug here: the __builtin_ia32_pcmpgtb128 builtin
    // generates a weird (and wrong) sequence.
    // GCC's emmintrin.h uses comparison operators we don't have instead.
    // PERF: this is a quite severe GDC performance problem.
    // Could be worked around with inline assembly, or another algorithm I guess.

    /*
    static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else */
    {
        return cast(__m128i) greaterMask!byte16(cast(byte16)a, cast(byte16)b);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 127, -80, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, -128, -42, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);
    byte16 C = cast(byte16) _mm_cmpgt_epi8(A, B);
    byte[16] correct = [0, 0,-1, 0, -1, 0, 0, 0,-1,-1,-1, 0, 0, 0,-1, 0];
    __m128i D = _mm_cmpeq_epi8(A, B);
    assert(C.array == correct);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than.
__m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpgtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ogt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for greater-than, store the result in the lower element,
/// and copy the upper element from `a`.
__m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpgtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ogt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal.
__m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ole)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ole)(a, b);
    }
}

/// Compare packed 16-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi16(b, a);
}
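// Example added for illustration (not part of the original module).
unittest
{
    short8   A = [-3, -2, -1,  0, 0,  1,  2,  3];
    short8   B = [ 4,  3,  2,  1, 0, -1, -2, -3];
    short[8] E = [-1, -1, -1, -1, 0,  0,  0,  0];
    short8   R = cast(short8)(_mm_cmplt_epi16(cast(__m128i)A, cast(__m128i)B));
    assert(R.array == E);
}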
/// Compare packed 32-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi32(b, a);
}

/// Compare packed 8-bit integers in `a` and `b` for less-than.
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
{
    return _mm_cmpgt_epi8(b, a);
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than.
__m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.olt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.olt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal.
__m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.une)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-equal, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpneqsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.une)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal.
__m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ult)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than-or-equal, store the result in
/// the lower element, and copy the upper element from `a`.
__m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngesd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpltsd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ult)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than.
__m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpngtpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ule)(a, b);
    }
}
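// Example added for illustration (not part of the original module): with NaN the
// comparison is unordered, so "not-greater-than" yields true.
unittest
{
    __m128d A = _mm_setr_pd(1.0, double.nan);
    __m128d B = _mm_setr_pd(1.0, 2.0);
    long2 R = cast(long2) _mm_cmpngt_pd(A, B);
    assert(R.array[0] == -1); // 1.0 is not greater than 1.0
    assert(R.array[1] == -1); // unordered => not-greater-than is true
}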
/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-greater-than, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
{
    // Note: There is no __builtin_ia32_cmpngtsd builtin.
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmplesd(b, a);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ule)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal.
__m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlepd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ugt)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than-or-equal, store the result in the
/// lower element, and copy the upper element from `a`.
__m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnlesd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ugt)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than.
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uge)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` for not-less-than, store the result in the lower
/// element, and copy the upper element from `a`.
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpnltsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uge)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN.
__m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.ord)(a, b);
    }
}

/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if neither is NaN, store the result in the
/// lower element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.ord)(a, b);
    }
}

/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN.
__m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordpd(a, b);
    }
    else
    {
        return cast(__m128d) cmppd!(FPComparison.uno)(a, b);
    }
}
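// Example added for illustration (not part of the original module): a lane is
// "unordered" when at least one of its two operands is NaN.
unittest
{
    __m128d A = _mm_setr_pd(double.nan, 1.0);
    __m128d B = _mm_setr_pd(3.0, 1.0);
    long2 R = cast(long2) _mm_cmpunord_pd(A, B);
    assert(R.array[0] == -1); // NaN involved => unordered
    assert(R.array[1] == 0);  // both operands ordered
}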
/// Compare the lower double-precision (64-bit) floating-point elements
/// in `a` and `b` to see if either is NaN, store the result in the lower
/// element, and copy the upper element from `a` to the upper element.
__m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cmpunordsd(a, b);
    }
    else
    {
        return cast(__m128d) cmpsd!(FPComparison.uno)(a, b);
    }
}
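// Example added for illustration (not part of the original module).
unittest
{
    __m128d A = _mm_setr_pd(double.nan, 4.0);
    __m128d B = _mm_setr_pd(3.0, 5.0);
    __m128d R = _mm_cmpunord_sd(A, B);
    assert( (cast(long2)R).array[0] == -1 ); // lower pair is unordered
    assert( R.array[1] == 4.0 );             // upper element comes from `a`
}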
/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for equality, and return the boolean result (0 or 1).
int _mm_comieq_sd (__m128d a, __m128d b) pure @safe
{
    // Note: For some of the _mm_comixx_sx intrinsics, the NaN semantics are not the same
    // as the comisd instruction: they return false in the unordered case instead.
    //
    // Actually C++ compilers disagree over the meaning of that instruction.
    // GCC will manage NaNs like the comisd instruction (return true if unordered),
    // but ICC, clang and MSVC will deal with NaN like the Intel Intrinsics Guide says.
    // We choose to follow the majority. It seems GCC is buggy with NaNs.
    return a.array[0] == b.array[0];
}
unittest
{
    assert(1 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comieq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comieq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than-or-equal, and return the boolean
/// result (0 or 1).
int _mm_comige_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] >= b.array[0];
}
unittest
{
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comige_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comige_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comige_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for greater-than, and return the boolean result (0 or 1).
int _mm_comigt_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] > b.array[0];
}
unittest
{
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comigt_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than-or-equal, and return the boolean result (0 or 1).
int _mm_comile_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] <= b.array[0];
}
unittest
{
    assert(1 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comile_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comile_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comile_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(1 == _mm_comile_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for less-than, and return the boolean result (0 or 1).
int _mm_comilt_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] < b.array[0];
}
unittest
{
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comilt_sd(_mm_set_sd(-78.0), _mm_set_sd(78.0)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comilt_sd(_mm_set_sd(-0.0), _mm_set_sd(0.0)));
}

/// Compare the lower double-precision (64-bit) floating-point element
/// in `a` and `b` for not-equal, and return the boolean result (0 or 1).
int _mm_comineq_sd (__m128d a, __m128d b) pure @safe
{
    return a.array[0] != b.array[0];
}
unittest
{
    assert(0 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(-78.0)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(78.0), _mm_set_sd(double.nan)));
    assert(1 == _mm_comineq_sd(_mm_set_sd(double.nan), _mm_set_sd(-4.22)));
    assert(0 == _mm_comineq_sd(_mm_set_sd(0.0), _mm_set_sd(-0.0)));
}

/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
__m128d _mm_cvtepi32_pd (__m128i a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtdq2pd since LDC 1.0, even without optimizations
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sitofp <2 x i32> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128i)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtepi32_pd(_mm_set1_epi32(54));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
__m128 _mm_cvtepi32_ps(__m128i a) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtdq2ps(a);
    }
    else version(LDC)
    {
        // See #86 for why we had to resort to LLVM IR.
        // Plain code below was leading to catastrophic behaviour.
        // x86: Generates cvtdq2ps since LDC 1.1.0 -O0
        // ARM: Generates scvtf.4s since LDC 1.8.0 -O0
        enum ir = `
            %r = sitofp <4 x i32> %0 to <4 x float>
            ret <4 x float> %r`;
        return cast(__m128) LDCInlineIR!(ir, float4, int4)(a);
    }
    else
    {
        __m128 res; // PERF =void;
        res.ptr[0] = cast(float)a.array[0];
        res.ptr[1] = cast(float)a.array[1];
        res.ptr[2] = cast(float)a.array[2];
        res.ptr[3] = cast(float)a.array[3];
        return res;
    }
}
unittest
{
    __m128 a = _mm_cvtepi32_ps(_mm_setr_epi32(-1, 0, 1, 1000));
    assert(a.array == [-1.0f, 0.0f, 1.0f, 1000.0f]);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtpd_epi32 (__m128d a) @trusted
{
    // PERF ARM32
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        long2 i;
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     i = vcvtnq_s64_f64(a); break;
            case _MM_ROUND_DOWN_ARM:        i = vcvtmq_s64_f64(a); break;
            case _MM_ROUND_UP_ARM:          i = vcvtpq_s64_f64(a); break;
            case _MM_ROUND_TOWARD_ZERO_ARM: i = vcvtzq_s64_f64(a); break;
        }
        int4 zero = 0;
        return cast(__m128i) shufflevectorLDC!(int4, 0, 2, 4, 6)(cast(int4)i, zero);
    }
    else
    {
        // PERF ARM32
        __m128i r = _mm_setzero_si128();
        r.ptr[0] = convertDoubleToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertDoubleToInt32UsingMXCSR(a.array[1]);
        return r;
    }
}
unittest
{
    int4 A = _mm_cvtpd_epi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61 && A.array[2] == 0 && A.array[3] == 0);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers.
__m64 _mm_cvtpd_pi32 (__m128d v) @safe
{
    return to_m64(_mm_cvtpd_epi32(v));
}
unittest
{
    int2 A = cast(int2) _mm_cvtpd_pi32(_mm_set_pd(61.0, 55.0));
    assert(A.array[0] == 55 && A.array[1] == 61);
}

/// Convert packed double-precision (64-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
__m128 _mm_cvtpd_ps (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a); // can't be done with IR unfortunately
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtpd2ps(a);
    }
    else
    {
        __m128 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128d A = _mm_set_pd(5.25, 4.0);
    __m128 B = _mm_cvtpd_ps(A);
    assert(B.array == [4.0f, 5.25f, 0, 0]);
}

/// Convert packed 32-bit integers in `v` to packed double-precision
/// (64-bit) floating-point elements.
__m128d _mm_cvtpi32_pd (__m64 v) pure @safe
{
    return _mm_cvtepi32_pd(to_m128i(v));
}
unittest
{
    __m128d A = _mm_cvtpi32_pd(_mm_setr_pi32(4, -5));
    assert(A.array[0] == 4.0 && A.array[1] == -5.0);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed 32-bit integers.
__m128i _mm_cvtps_epi32 (__m128 a) @trusted
{
    static if (LDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_cvtps2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2dq(a);
    }
    else static if (LDC_with_ARM64)
    {
        // Get current rounding mode.
        uint fpscr = arm_get_fpcr();
        switch(fpscr & _MM_ROUND_MASK_ARM)
        {
            default:
            case _MM_ROUND_NEAREST_ARM:     return vcvtnq_s32_f32(a);
            case _MM_ROUND_DOWN_ARM:        return vcvtmq_s32_f32(a);
            case _MM_ROUND_UP_ARM:          return vcvtpq_s32_f32(a);
            case _MM_ROUND_TOWARD_ZERO_ARM: return vcvtzq_s32_f32(a);
        }
    }
    else
    {
        __m128i r = void;
        r.ptr[0] = convertFloatToInt32UsingMXCSR(a.array[0]);
        r.ptr[1] = convertFloatToInt32UsingMXCSR(a.array[1]);
        r.ptr[2] = convertFloatToInt32UsingMXCSR(a.array[2]);
        r.ptr[3] = convertFloatToInt32UsingMXCSR(a.array[3]);
        return r;
    }
}
unittest
{
    // GDC bug #98607
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98607
    // GDC does not provide an optimization barrier for the rounding mode.
    // Worked around with different literals. This bug will likely only manifest in unittest.
    // GCC people provided no actual fix and instead say other compilers are buggy... when they aren't.

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    __m128i A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.1f, 53.5f, -2.9f));
    assert(A.array == [1, -2, 54, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.11f, 53.4f, -2.8f));
    assert(A.array == [1, -3, 53, -3]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f));
    assert(A.array == [2, -2, 54, -2]);

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    A = _mm_cvtps_epi32(_mm_setr_ps(1.4f, -2.17f, 53.8f, -2.91f));
    assert(A.array == [1, -2, 53, -2]);

    _MM_SET_ROUNDING_MODE(savedRounding);
}

/// Convert packed single-precision (32-bit) floating-point elements
/// in `a` to packed double-precision (64-bit) floating-point elements.
__m128d _mm_cvtps_pd (__m128 a) pure @trusted
{
    version(LDC)
    {
        // Generates cvtps2pd since LDC 1.0 -O0
        enum ir = `
            %v = shufflevector <4 x float> %0,<4 x float> %0, <2 x i32> <i32 0, i32 1>
            %r = fpext <2 x float> %v to <2 x double>
            ret <2 x double> %r`;
        return cast(__m128d) LDCInlineIR!(ir, __m128d, __m128)(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtps2pd(a);
    }
    else
    {
        double2 r = void;
        r.ptr[0] = a.array[0];
        r.ptr[1] = a.array[1];
        return r;
    }
}
unittest
{
    __m128d A = _mm_cvtps_pd(_mm_set1_ps(54.0f));
    assert(A.array[0] == 54.0);
    assert(A.array[1] == 54.0);
}

/// Copy the lower double-precision (64-bit) floating-point element of `a`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    return a.array[0];
}
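// Example added for illustration (not part of the original module).
unittest
{
    __m128d A = _mm_setr_pd(2.5, -1.0);
    assert(_mm_cvtsd_f64(A) == 2.5);
}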
/// Convert the lower double-precision (64-bit) floating-point element
/// in `a` to a 32-bit integer.
int _mm_cvtsd_si32 (__m128d a) @safe
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2si(a);
    }
    else
    {
        return convertDoubleToInt32UsingMXCSR(a[0]);
    }
}
unittest
{
    assert(4 == _mm_cvtsd_si32(_mm_set1_pd(4.0)));
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer.
long _mm_cvtsd_si64 (__m128d a) @trusted
{
    version (LDC)
    {
        version (X86_64)
        {
            return __builtin_ia32_cvtsd2si64(a);
        }
        else
        {
            // Note: In 32-bit x86, there is no way to convert from float/double to 64-bit integer
            // using SSE instructions only. So the builtin doesn't exist for this arch.
            return convertDoubleToInt64UsingMXCSR(a[0]);
        }
    }
    else
    {
        return convertDoubleToInt64UsingMXCSR(a.array[0]);
    }
}
unittest
{
    assert(-4 == _mm_cvtsd_si64(_mm_set1_pd(-4.0)));

    uint savedRounding = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.49)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(-56468486187 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    assert(56468486187 == _mm_cvtsd_si64(_mm_set1_pd(56468486186.1)));

    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    assert(-56468486186 == _mm_cvtsd_si64(_mm_set1_pd(-56468486186.9)));

    _MM_SET_ROUNDING_MODE(savedRounding);
}

deprecated("Use _mm_cvtsd_si64 instead") alias _mm_cvtsd_si64x = _mm_cvtsd_si64; ///

/// Convert the lower double-precision (64-bit) floating-point element in `b` to a single-precision (32-bit)
/// floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
__m128 _mm_cvtsd_ss (__m128 a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvtsd2ss(a, b);
    }
    else
    {
        // Generates cvtsd2ss since LDC 1.3 -O0
        a.ptr[0] = b.array[0];
        return a;
    }
}
unittest
{
    __m128 R = _mm_cvtsd_ss(_mm_set1_ps(4.0f), _mm_set1_pd(3.0));
    assert(R.array == [3.0f, 4.0f, 4.0f, 4.0f]);
}

/// Get the lower 32-bit integer in `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    return a.array[0];
}

/// Get the lower 64-bit integer in `a`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 la = cast(long2)a;
    return la.array[0];
}
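// Example added for illustration (not part of the original module).
unittest
{
    __m128i A = _mm_setr_epi64(-1, 12345);
    assert(_mm_cvtsi128_si64(A) == -1);
}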
deprecated("Use _mm_cvtsi128_si64 instead") alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;

/// Convert the signed 32-bit integer `b` to a double-precision (64-bit) floating-point element, store that in the
/// lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi32_sd(__m128d a, int b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi32_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 32-bit integer `a` to the lower element of result, and zero the upper elements.
__m128i _mm_cvtsi32_si128 (int a) pure @trusted
{
    int4 r = [0, 0, 0, 0];
    r.ptr[0] = a;
    return r;
}
unittest
{
    __m128i a = _mm_cvtsi32_si128(65);
    assert(a.array == [65, 0, 0, 0]);
}

/// Convert the signed 64-bit integer `b` to a double-precision (64-bit) floating-point element, store the result in
/// the lower element of result, and copy the upper element from `a` to the upper element of result.
__m128d _mm_cvtsi64_sd(__m128d a, long b) pure @trusted
{
    a.ptr[0] = cast(double)b;
    return a;
}
unittest
{
    __m128d a = _mm_cvtsi64_sd(_mm_set1_pd(0.0f), 42);
    assert(a.array == [42.0, 0]);
}

/// Copy 64-bit integer `a` to the lower element of result, and zero the upper element.
__m128i _mm_cvtsi64_si128 (long a) pure @trusted
{
    long2 r = [0, 0];
    r.ptr[0] = a;
    return cast(__m128i)(r);
}
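// Example added for illustration (not part of the original module).
unittest
{
    long2 R = cast(long2) _mm_cvtsi64_si128(-42);
    long[2] correct = [-42, 0];
    assert(R.array == correct);
}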
deprecated("Use _mm_cvtsi64_sd instead") alias _mm_cvtsi64x_sd = _mm_cvtsi64_sd; ///
deprecated("Use _mm_cvtsi64_si128 instead") alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128; ///

/// Convert the lower single-precision (32-bit) floating-point element in `b` to a double-precision (64-bit)
/// floating-point element, store that in the lower element of result, and copy the upper element from `a` to the upper
/// element of result.
double2 _mm_cvtss_sd(double2 a, float4 b) pure @trusted
{
    a.ptr[0] = b.array[0];
    return a;
}
unittest
{
    __m128d a = _mm_cvtss_sd(_mm_set1_pd(0.0f), _mm_set1_ps(42.0f));
    assert(a.array == [42.0, 0]);
}

/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttss_si64 (__m128 a) pure @safe
{
    return cast(long)(a.array[0]); // Generates cvttss2si as expected
}
unittest
{
    assert(1 == _mm_cvttss_si64(_mm_setr_ps(1.9f, 2.0f, 3.0f, 4.0f)));
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
/// Put zeroes in the upper elements of result.
__m128i _mm_cvttpd_epi32 (__m128d a) pure @trusted
{
    static if (LDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_cvttpd2dq(a);
    }
    else
    {
        // Note: doesn't generate cvttpd2dq as of LDC 1.13
        __m128i r; // PERF =void;
        r.ptr[0] = cast(int)a.array[0];
        r.ptr[1] = cast(int)a.array[1];
        r.ptr[2] = 0;
        r.ptr[3] = 0;
        return r;
    }
}
unittest
{
    __m128i R = _mm_cvttpd_epi32(_mm_setr_pd(-4.9, 45641.5f));
    assert(R.array == [-4, 45641, 0, 0]);
}

/// Convert packed double-precision (64-bit) floating-point elements in `v`
/// to packed 32-bit integers with truncation.
__m64 _mm_cvttpd_pi32 (__m128d v) pure @safe
{
    return to_m64(_mm_cvttpd_epi32(v));
}
unittest
{
    int2 R = cast(int2) _mm_cvttpd_pi32(_mm_setr_pd(-4.9, 45641.7f));
    int[2] correct = [-4, 45641];
    assert(R.array == correct);
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
__m128i _mm_cvttps_epi32 (__m128 a) pure @trusted
{
    // x86: Generates cvttps2dq since LDC 1.3 -O2
    // ARM64: generates fcvtze since LDC 1.8 -O2
    __m128i r; // PERF = void;
    r.ptr[0] = cast(int)a.array[0];
    r.ptr[1] = cast(int)a.array[1];
    r.ptr[2] = cast(int)a.array[2];
    r.ptr[3] = cast(int)a.array[3];
    return r;
}
unittest
{
    __m128i R = _mm_cvttps_epi32(_mm_setr_ps(-4.9, 45641.5f, 0.0f, 1.0f));
    assert(R.array == [-4, 45641, 0, 1]);
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer with truncation.
int _mm_cvttsd_si32 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    return cast(int)a.array[0];
}

/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer with truncation.
long _mm_cvttsd_si64 (__m128d a)
{
    // Generates cvttsd2si since LDC 1.3 -O0
    // but in 32-bit it's a long sequence that resorts to the FPU
    return cast(long)a.array[0];
}

deprecated("Use _mm_cvttsd_si64 instead") alias _mm_cvttsd_si64x = _mm_cvttsd_si64; ///

/// Divide packed double-precision (64-bit) floating-point elements in `a` by packed elements in `b`.
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a / b;
}
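// Example added for illustration (not part of the original module).
unittest
{
    __m128d a = [6.0, -9.0];
    __m128d b = [3.0,  2.0];
    a = _mm_div_pd(a, b);
    assert(a.array == [2.0, -4.5]);
}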
/// Divide the lower double-precision (64-bit) floating-point element in `a` by the lower element in `b`,
/// store the result in the lower element, and copy the upper element from `a`.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @trusted
{
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_divsd(a, b);
    }
    else version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a.array[0] = a.array[0] / b.array[0];
        return a;
    }
    else
    {
        a.ptr[0] /= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [2.0, 4.5];
    a = _mm_div_sd(a, a);
    assert(a.array == [1.0, 4.5]);
}

/// Extract a 16-bit integer from `v`, selected with `index`.
/// Warning: the returned value is zero-extended to 32-bits.
int _mm_extract_epi16(__m128i v, int index) pure @safe
{
    short8 r = cast(short8)v;
    return cast(ushort)(r.array[index & 7]);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16(A, 6) == 6);
    assert(_mm_extract_epi16(A, 0) == 65535);
    assert(_mm_extract_epi16(A, 5 + 8) == 5);
}

/// Copy `v`, and insert the 16-bit integer `i` at the location specified by `index`.
__m128i _mm_insert_epi16 (__m128i v, int i, int index) @trusted
{
    short8 r = cast(short8)v;
    r.ptr[index & 7] = cast(short)i;
    return cast(__m128i)r;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 R = cast(short8) _mm_insert_epi16(A, 42, 6);
    short[8] correct = [0, 1, 2, 3, 4, 5, 42, 7];
    assert(R.array == correct);
}

/// Perform a serializing operation on all load-from-memory instructions that were issued prior
/// to this instruction. Guarantees that every load instruction that precedes, in program order,
/// is globally visible before any load instruction which follows the fence in program order.
void _mm_lfence() @trusted
{
    version(GNU)
    {
        static if (GDC_with_SSE2)
        {
            __builtin_ia32_lfence();
        }
        else version(X86)
        {
            asm pure nothrow @nogc @trusted
            {
                "lfence;\n" : : : ;
            }
        }
        else
            static assert(false);
    }
    else static if (LDC_with_SSE2)
    {
        __builtin_ia32_lfence();
    }
    else static if (LDC_with_ARM64)
    {
        __builtin_arm_dmb(9);  // dmb ishld
    }
    else static if (DMD_with_asm)
    {
        asm nothrow @nogc pure @safe
        {
            lfence;
        }
    }
    else version(LDC)
    {
        // When the architecture is unknown, generate a full memory barrier,
        // as the semantics of sfence do not really match those of atomics.
        llvm_memory_fence();
    }
    else
        static assert(false);
}
unittest
{
    _mm_lfence();
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    pragma(inline, true);
    __m128d* aligned = cast(__m128d*)mem_addr;
    return *aligned;
}
unittest
{
    align(16) double[2] S = [-5.0, 7.0];
    __m128d R = _mm_load_pd(S.ptr);
    assert(R.array == S);
}

/// Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
/// `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double m = *mem_addr;
    __m128d r; // PERF =void;
    r.ptr[0] = m;
    r.ptr[1] = m;
    return r;
}
unittest
{
    double what = 4;
    __m128d R = _mm_load_pd1(&what);
    double[2] correct = [4.0, 4];
    assert(R.array == correct);
}

/// Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper
/// element. `mem_addr` does not need to be aligned on any particular boundary.
__m128d _mm_load_sd (const(double)* mem_addr) pure @trusted
{
    double2 r = [0, 0];
    r.ptr[0] = *mem_addr;
    return r;
}
unittest
{
    double x = -42;
    __m128d a = _mm_load_sd(&x);
    assert(a.array == [-42.0, 0.0]);
}

/// Load 128-bits of integer data from memory into dst.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1872 __m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted // TODO: shoudln't be trusted because alignment, Issue #62 1873 { 1874 pragma(inline, true); 1875 return *mem_addr; 1876 } 1877 unittest 1878 { 1879 align(16) int[4] correct = [-1, 2, 3, 4]; 1880 int4 A = cast(int4) _mm_load_si128(cast(__m128i*) correct.ptr); 1881 assert(A.array == correct); 1882 } 1883 1884 alias _mm_load1_pd = _mm_load_pd1; /// 1885 1886 /// Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the 1887 /// lower element from `a` to result. `mem_addr` does not need to be aligned on any particular boundary. 1888 __m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @trusted 1889 { 1890 pragma(inline, true); 1891 a.ptr[1] = *mem_addr; 1892 return a; 1893 } 1894 unittest 1895 { 1896 double A = 7.0; 1897 __m128d B = _mm_setr_pd(4.0, -5.0); 1898 __m128d R = _mm_loadh_pd(B, &A); 1899 double[2] correct = [ 4.0, 7.0 ]; 1900 assert(R.array == correct); 1901 } 1902 1903 /// Load 64-bit integer from memory into the first element of result. Zero out the other. 1904 // Note: strange signature since the memory doesn't have to aligned (Issue #60), and doesn't have to be 128-bit 1905 __m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @trusted // TODO signature 1906 { 1907 pragma(inline, true); 1908 auto pLong = cast(const(long)*)mem_addr; 1909 long2 r = [0, 0]; 1910 r.ptr[0] = *pLong; 1911 return cast(__m128i)(r); 1912 } 1913 unittest 1914 { 1915 long A = 0x7878787870707070; 1916 long2 R = cast(long2) _mm_loadl_epi64(cast(__m128i*)&A); 1917 long[2] correct = [0x7878787870707070, 0]; 1918 assert(R.array == correct); 1919 } 1920 1921 /// Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the 1922 /// upper element from `a` to result. mem_addr does not need to be aligned on any particular boundary. 1923 __m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @trusted 1924 { 1925 a.ptr[0] = *mem_addr; 1926 return a; 1927 } 1928 unittest 1929 { 1930 double A = 7.0; 1931 __m128d B = _mm_setr_pd(4.0, -5.0); 1932 __m128d R = _mm_loadl_pd(B, &A); 1933 double[2] correct = [ 7.0, -5.0 ]; 1934 assert(R.array == correct); 1935 } 1936 1937 /// Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. 1938 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 1939 __m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted 1940 { 1941 __m128d a = *cast(__m128d*)(mem_addr); 1942 __m128d r; // PERF =void; 1943 r.ptr[0] = a.array[1]; 1944 r.ptr[1] = a.array[0]; 1945 return r; 1946 } 1947 unittest 1948 { 1949 align(16) double[2] A = [56.0, -74.0]; 1950 __m128d R = _mm_loadr_pd(A.ptr); 1951 double[2] correct = [-74.0, 56.0]; 1952 assert(R.array == correct); 1953 } 1954 1955 /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. 1956 /// `mem_addr` does not need to be aligned on any particular boundary. 1957 __m128d _mm_loadu_pd (const(double)* mem_addr) pure @trusted 1958 { 1959 pragma(inline, true); 1960 static if (GDC_with_SSE2) 1961 { 1962 return __builtin_ia32_loadupd(mem_addr); 1963 } 1964 else version(LDC) 1965 { 1966 return loadUnaligned!(double2)(mem_addr); 1967 } 1968 else version(DigitalMars) 1969 { 1970 // Apparently inside __simd you can use aligned dereferences without fear. 1971 // That was issue 23048 on dlang's Bugzilla. 
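            // The DMD-specific paths below: with D_SIMD, XMM.LODUPD is issued through
            // __simd; when SSE-sized vectors are emulated, the pointer is simply
            // dereferenced (no alignment constraint in that case); otherwise the two
            // doubles are copied element by element.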
1972 static if (DMD_with_DSIMD) 1973 { 1974 return cast(__m128d)__simd(XMM.LODUPD, *cast(double2*)mem_addr); 1975 } 1976 else static if (SSESizedVectorsAreEmulated) 1977 { 1978 // Since this vector is emulated, it doesn't have alignement constraints 1979 // and as such we can just cast it. 1980 return *cast(__m128d*)(mem_addr); 1981 } 1982 else 1983 { 1984 __m128d result; 1985 result.ptr[0] = mem_addr[0]; 1986 result.ptr[1] = mem_addr[1]; 1987 return result; 1988 } 1989 } 1990 else 1991 { 1992 __m128d result; 1993 result.ptr[0] = mem_addr[0]; 1994 result.ptr[1] = mem_addr[1]; 1995 return result; 1996 } 1997 } 1998 unittest 1999 { 2000 double[2] A = [56.0, -75.0]; 2001 __m128d R = _mm_loadu_pd(A.ptr); 2002 double[2] correct = [56.0, -75.0]; 2003 assert(R.array == correct); 2004 } 2005 2006 /// Load 128-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary. 2007 __m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted 2008 { 2009 // PERF DMD 2010 pragma(inline, true); 2011 static if (GDC_with_SSE2) 2012 { 2013 return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char*))mem_addr); 2014 } 2015 else version(LDC) 2016 { 2017 return loadUnaligned!(__m128i)(cast(int*)mem_addr); 2018 } 2019 else 2020 { 2021 const(int)* p = cast(const(int)*)mem_addr; 2022 __m128i r = void; 2023 r.ptr[0] = p[0]; 2024 r.ptr[1] = p[1]; 2025 r.ptr[2] = p[2]; 2026 r.ptr[3] = p[3]; 2027 return r; 2028 } 2029 } 2030 unittest 2031 { 2032 align(16) int[4] correct = [-1, 2, -3, 4]; 2033 int4 A = cast(int4) _mm_loadu_si128(cast(__m128i*) correct.ptr); 2034 assert(A.array == correct); 2035 } 2036 2037 /// Load unaligned 32-bit integer from memory into the first element of result. 2038 __m128i _mm_loadu_si32 (const(void)* mem_addr) pure @trusted 2039 { 2040 pragma(inline, true); 2041 int r = *cast(int*)(mem_addr); 2042 int4 result = [0, 0, 0, 0]; 2043 result.ptr[0] = r; 2044 return result; 2045 } 2046 unittest 2047 { 2048 int r = 42; 2049 __m128i A = _mm_loadu_si32(&r); 2050 int[4] correct = [42, 0, 0, 0]; 2051 assert(A.array == correct); 2052 } 2053 2054 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate 2055 /// signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, 2056 /// and pack the results in destination. 
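// In scalar terms, for 16-bit lanes a0..a7 and b0..b7 the result is the int4
// [a0*b0 + a1*b1, a2*b2 + a3*b3, a4*b4 + a5*b5, a6*b6 + a7*b7]. The sum can only
// overflow the 32-bit range when all four operands of a pair are -32768, in which
// case it wraps to int.min, as the unittest below demonstrates.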
2057 __m128i _mm_madd_epi16 (__m128i a, __m128i b) pure @trusted 2058 { 2059 static if (GDC_with_SSE2) 2060 { 2061 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2062 } 2063 else static if (LDC_with_SSE2) 2064 { 2065 return cast(__m128i) __builtin_ia32_pmaddwd128(cast(short8)a, cast(short8)b); 2066 } 2067 else static if (LDC_with_ARM64) 2068 { 2069 int4 pl = vmull_s16(vget_low_s16(cast(short8)a), vget_low_s16(cast(short8)b)); 2070 int4 ph = vmull_s16(vget_high_s16(cast(short8)a), vget_high_s16(cast(short8)b)); 2071 int2 rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); 2072 int2 rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); 2073 return vcombine_s32(rl, rh); 2074 } 2075 else 2076 { 2077 short8 sa = cast(short8)a; 2078 short8 sb = cast(short8)b; 2079 int4 r; 2080 foreach(i; 0..4) 2081 { 2082 r.ptr[i] = sa.array[2*i] * sb.array[2*i] + sa.array[2*i+1] * sb.array[2*i+1]; 2083 } 2084 return r; 2085 } 2086 } 2087 unittest 2088 { 2089 short8 A = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2090 short8 B = [0, 1, 2, 3, -32768, -32768, 32767, 32767]; 2091 int4 R = _mm_madd_epi16(cast(__m128i)A, cast(__m128i)B); 2092 int[4] correct = [1, 13, -2147483648, 2*32767*32767]; 2093 assert(R.array == correct); 2094 } 2095 2096 /// Conditionally store 8-bit integer elements from `a` into memory using `mask` 2097 /// (elements are not stored when the highest bit is not set in the corresponding element) 2098 /// and a non-temporal memory hint. `mem_addr` does not need to be aligned on any particular 2099 /// boundary. 2100 void _mm_maskmoveu_si128 (__m128i a, __m128i mask, void* mem_addr) @trusted 2101 { 2102 static if (GDC_with_SSE2) 2103 { 2104 return __builtin_ia32_maskmovdqu(cast(ubyte16)a, cast(ubyte16)mask, cast(char*)mem_addr); 2105 } 2106 else static if (LDC_with_SSE2) 2107 { 2108 return __builtin_ia32_maskmovdqu(cast(byte16)a, cast(byte16)mask, cast(char*)mem_addr); 2109 } 2110 else static if (LDC_with_ARM64) 2111 { 2112 // PERF: catastrophic on ARM32 2113 byte16 bmask = cast(byte16)mask; 2114 byte16 shift = 7; 2115 bmask = bmask >> shift; // sign-extend to have a 0xff or 0x00 mask 2116 mask = cast(__m128i) bmask; 2117 __m128i dest = loadUnaligned!__m128i(cast(int*)mem_addr); 2118 dest = (a & mask) | (dest & ~mask); 2119 storeUnaligned!__m128i(dest, cast(int*)mem_addr); 2120 } 2121 else 2122 { 2123 byte16 b = cast(byte16)a; 2124 byte16 m = cast(byte16)mask; 2125 byte* dest = cast(byte*)(mem_addr); 2126 foreach(j; 0..16) 2127 { 2128 if (m.array[j] & 128) 2129 { 2130 dest[j] = b.array[j]; 2131 } 2132 } 2133 } 2134 } 2135 unittest 2136 { 2137 ubyte[16] dest = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]; 2138 __m128i mask = _mm_setr_epi8(0,-1, 0,-1,-1, 1,-1,-1, 0,-1,-4,-1,-1, 0,-127, 0); 2139 __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15); 2140 _mm_maskmoveu_si128(A, mask, dest.ptr); 2141 ubyte[16] correct = [42, 1,42, 3, 4,42, 6, 7,42, 9,10,11,12,42,14,42]; 2142 assert(dest == correct); 2143 } 2144 2145 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed maximum values. 
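// Implementation note for the min/max fallback paths below: given a lane mask `m`
// that is all-ones where `a` must win, a branchless select is computed either as
// (m & a) | (~m & b), or equivalently as b ^ ((a ^ b) & m), using a ^ (a ^ b) == b.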
2146 __m128i _mm_max_epi16 (__m128i a, __m128i b) pure @safe 2147 { 2148 static if (GDC_with_SSE2) 2149 { 2150 return cast(__m128i) __builtin_ia32_pmaxsw128(cast(short8)a, cast(short8)b); 2151 } 2152 else version(LDC) 2153 { 2154 // x86: pmaxsw since LDC 1.0 -O1 2155 // ARM: smax.8h since LDC 1.5 -01 2156 short8 sa = cast(short8)a; 2157 short8 sb = cast(short8)b; 2158 short8 greater = greaterMask!short8(sa, sb); 2159 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2160 } 2161 else 2162 { 2163 __m128i lowerShorts = _mm_cmpgt_epi16(a, b); // ones where a should be selected, b else 2164 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2165 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2166 return _mm_xor_si128(b, mask); 2167 } 2168 } 2169 unittest 2170 { 2171 short8 R = cast(short8) _mm_max_epi16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57), 2172 _mm_setr_epi16(-4,-8, 9, 7, 0,-32768, 0, 0)); 2173 short[8] correct = [32767, 1, 9, 7, 9, 7, 0, 0]; 2174 assert(R.array == correct); 2175 } 2176 2177 /// Compare packed unsigned 8-bit integers in a and b, and return packed maximum values. 2178 __m128i _mm_max_epu8 (__m128i a, __m128i b) pure @safe 2179 { 2180 version(LDC) 2181 { 2182 // x86: pmaxub since LDC 1.0.0 -O1 2183 // ARM64: umax.16b since LDC 1.5.0 -O1 2184 // PERF: catastrophic on ARM32 2185 ubyte16 sa = cast(ubyte16)a; 2186 ubyte16 sb = cast(ubyte16)b; 2187 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2188 return cast(__m128i)( (greater & sa) | (~greater & sb) ); 2189 } 2190 else 2191 { 2192 __m128i value128 = _mm_set1_epi8(-128); 2193 __m128i higher = _mm_cmpgt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2194 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2195 __m128i mask = _mm_and_si128(aTob, higher); 2196 return _mm_xor_si128(b, mask); 2197 } 2198 } 2199 unittest 2200 { 2201 byte16 R = cast(byte16) _mm_max_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2202 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2203 byte[16] correct = [-4,-8, -4, -8, 9,-57, 0,-57, -4,-8, -4, -8, 9,-57, 0,-57]; 2204 assert(R.array == correct); 2205 } 2206 2207 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return 2208 /// packed maximum values. 2209 __m128d _mm_max_pd (__m128d a, __m128d b) pure @trusted 2210 { 2211 static if (GDC_with_SSE2) 2212 { 2213 return __builtin_ia32_maxpd(a, b); 2214 } 2215 else 2216 { 2217 // x86: Generates maxpd starting with LDC 1.9 -O2 2218 a.ptr[0] = (a.array[0] > b.array[0]) ? a.array[0] : b.array[0]; 2219 a.ptr[1] = (a.array[1] > b.array[1]) ? a.array[1] : b.array[1]; 2220 return a; 2221 } 2222 } 2223 unittest 2224 { 2225 __m128d A = _mm_setr_pd(4.0, 1.0); 2226 __m128d B = _mm_setr_pd(1.0, 8.0); 2227 __m128d M = _mm_max_pd(A, B); 2228 assert(M.array[0] == 4.0); 2229 assert(M.array[1] == 8.0); 2230 } 2231 2232 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the maximum value in the 2233 /// lower element of result, and copy the upper element from `a` to the upper element of result. 2234 __m128d _mm_max_sd (__m128d a, __m128d b) pure @trusted 2235 { 2236 static if (GDC_with_SSE2) 2237 { 2238 return __builtin_ia32_maxsd(a, b); 2239 } 2240 else 2241 { 2242 __m128d r = a; 2243 // Generates maxsd starting with LDC 1.3 2244 r.ptr[0] = (a.array[0] > b.array[0]) ? 
a.array[0] : b.array[0]; 2245 return r; 2246 } 2247 } 2248 unittest 2249 { 2250 __m128d A = _mm_setr_pd(1.0, 1.0); 2251 __m128d B = _mm_setr_pd(4.0, 2.0); 2252 __m128d M = _mm_max_sd(A, B); 2253 assert(M.array[0] == 4.0); 2254 assert(M.array[1] == 1.0); 2255 } 2256 2257 /// Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to 2258 /// this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction 2259 /// is globally visible before any memory instruction which follows the fence in program order. 2260 void _mm_mfence() @trusted // not pure! 2261 { 2262 version(GNU) 2263 { 2264 static if (GDC_with_SSE2) 2265 { 2266 __builtin_ia32_mfence(); 2267 } 2268 else version(X86) 2269 { 2270 asm pure nothrow @nogc @trusted 2271 { 2272 "mfence;\n" : : : ; 2273 } 2274 } 2275 else 2276 static assert(false); 2277 } 2278 else static if (LDC_with_SSE2) 2279 { 2280 __builtin_ia32_mfence(); 2281 } 2282 else static if (DMD_with_asm) 2283 { 2284 asm nothrow @nogc pure @safe 2285 { 2286 mfence; 2287 } 2288 } 2289 else version(LDC) 2290 { 2291 // Note: will generate the DMB ish instruction on ARM 2292 llvm_memory_fence(); 2293 } 2294 else 2295 static assert(false); 2296 } 2297 unittest 2298 { 2299 _mm_mfence(); 2300 } 2301 2302 /// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values. 2303 __m128i _mm_min_epi16 (__m128i a, __m128i b) pure @safe 2304 { 2305 static if (GDC_with_SSE2) 2306 { 2307 return cast(__m128i) __builtin_ia32_pminsw128(cast(short8)a, cast(short8)b); 2308 } 2309 else version(LDC) 2310 { 2311 // x86: pminsw since LDC 1.0 -O1 2312 // ARM64: smin.8h since LDC 1.5 -01 2313 short8 sa = cast(short8)a; 2314 short8 sb = cast(short8)b; 2315 short8 greater = greaterMask!short8(sa, sb); 2316 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2317 } 2318 else 2319 { 2320 __m128i lowerShorts = _mm_cmplt_epi16(a, b); // ones where a should be selected, b else 2321 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2322 __m128i mask = _mm_and_si128(aTob, lowerShorts); 2323 return _mm_xor_si128(b, mask); 2324 } 2325 } 2326 unittest 2327 { 2328 short8 R = cast(short8) _mm_min_epi16(_mm_setr_epi16(45, 1, -4, -8, 9, 7, 0,-32768), 2329 _mm_setr_epi16(-4,-8, 9, 7, 0,-57, 0, 0)); 2330 short[8] correct = [-4,-8, -4, -8, 0,-57, 0, -32768]; 2331 assert(R.array == correct); 2332 } 2333 2334 /// Compare packed unsigned 8-bit integers in `a` and `b`, and return packed minimum values. 
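// Note: SSE2 has no unsigned byte comparison, so the generic path below biases both
// operands by -128 (flipping the sign bit) and then compares with the signed
// _mm_cmplt_epi8; _mm_max_epu8 above uses the same trick with _mm_cmpgt_epi8.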
2335 __m128i _mm_min_epu8 (__m128i a, __m128i b) pure @safe 2336 { 2337 version(LDC) 2338 { 2339 // x86: pminub since LDC 1.0.0 -O1 2340 // ARM: umin.16b since LDC 1.5.0 -O1 2341 // PERF: catastrophic on ARM32 2342 ubyte16 sa = cast(ubyte16)a; 2343 ubyte16 sb = cast(ubyte16)b; 2344 ubyte16 greater = cast(ubyte16) greaterMask!ubyte16(sa, sb); 2345 return cast(__m128i)( (~greater & sa) | (greater & sb) ); 2346 } 2347 else 2348 { 2349 __m128i value128 = _mm_set1_epi8(-128); 2350 __m128i lower = _mm_cmplt_epi8(_mm_add_epi8(a, value128), _mm_add_epi8(b, value128)); // signed comparison 2351 __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b 2352 __m128i mask = _mm_and_si128(aTob, lower); 2353 return _mm_xor_si128(b, mask); 2354 } 2355 } 2356 unittest 2357 { 2358 byte16 R = cast(byte16) _mm_min_epu8(_mm_setr_epi8(45, 1, -4, -8, 9, 7, 0,-57, -4,-8, 9, 7, 0,-57, 0, 0), 2359 _mm_setr_epi8(-4,-8, 9, 7, 0,-57, 0, 0, 45, 1, -4, -8, 9, 7, 0,-57)); 2360 byte[16] correct = [45, 1, 9, 7, 0, 7, 0, 0, 45, 1, 9, 7, 0, 7, 0, 0]; 2361 assert(R.array == correct); 2362 } 2363 2364 /// Compare packed double-precision (64-bit) floating-point elements in `a` and `b`, and return packed minimum values. 2365 __m128d _mm_min_pd (__m128d a, __m128d b) pure @trusted 2366 { 2367 static if (GDC_with_SSE2) 2368 { 2369 return __builtin_ia32_minpd(a, b); 2370 } 2371 else 2372 { 2373 // Generates minpd starting with LDC 1.9 2374 a.ptr[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2375 a.ptr[1] = (a.array[1] < b.array[1]) ? a.array[1] : b.array[1]; 2376 return a; 2377 } 2378 } 2379 unittest 2380 { 2381 __m128d A = _mm_setr_pd(1.0, 2.0); 2382 __m128d B = _mm_setr_pd(4.0, 1.0); 2383 __m128d M = _mm_min_pd(A, B); 2384 assert(M.array[0] == 1.0); 2385 assert(M.array[1] == 1.0); 2386 } 2387 2388 /// Compare the lower double-precision (64-bit) floating-point elements in `a` and `b`, store the minimum value in 2389 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 2390 __m128d _mm_min_sd (__m128d a, __m128d b) pure @safe 2391 { 2392 static if (GDC_with_SSE2) 2393 { 2394 return __builtin_ia32_minsd(a, b); 2395 } 2396 else 2397 { 2398 // Generates minsd starting with LDC 1.3 2399 __m128d r = a; 2400 r.array[0] = (a.array[0] < b.array[0]) ? a.array[0] : b.array[0]; 2401 return r; 2402 } 2403 } 2404 unittest 2405 { 2406 __m128d A = _mm_setr_pd(1.0, 3.0); 2407 __m128d B = _mm_setr_pd(4.0, 2.0); 2408 __m128d M = _mm_min_sd(A, B); 2409 assert(M.array[0] == 1.0); 2410 assert(M.array[1] == 3.0); 2411 } 2412 2413 /// Copy the lower 64-bit integer in `a` to the lower element of result, and zero the upper element. 2414 __m128i _mm_move_epi64 (__m128i a) pure @trusted 2415 { 2416 static if (GDC_with_SSE2) 2417 { 2418 // slightly better with GDC -O0 2419 return cast(__m128i) __builtin_ia32_movq128(cast(long2)a); 2420 } 2421 else 2422 { 2423 long2 result = [ 0, 0 ]; 2424 long2 la = cast(long2) a; 2425 result.ptr[0] = la.array[0]; 2426 return cast(__m128i)(result); 2427 } 2428 } 2429 unittest 2430 { 2431 long2 A = [13, 47]; 2432 long2 B = cast(long2) _mm_move_epi64( cast(__m128i)A ); 2433 long[2] correct = [13, 0]; 2434 assert(B.array == correct); 2435 } 2436 2437 /// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of result, and copy 2438 /// the upper element from `a` to the upper element of dst. 
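// In other words, _mm_move_sd(a, b) yields [ b[0], a[1] ]: the low element comes from
// the second argument, as the unittest below illustrates.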
2439 __m128d _mm_move_sd (__m128d a, __m128d b) pure @trusted 2440 { 2441 static if (GDC_with_SSE2) 2442 { 2443 return __builtin_ia32_movsd(a, b); 2444 } 2445 else 2446 { 2447 b.ptr[1] = a.array[1]; 2448 return b; 2449 } 2450 } 2451 unittest 2452 { 2453 double2 A = [13.0, 47.0]; 2454 double2 B = [34.0, 58.0]; 2455 double2 C = _mm_move_sd(A, B); 2456 double[2] correct = [34.0, 47.0]; 2457 assert(C.array == correct); 2458 } 2459 2460 /// Create mask from the most significant bit of each 8-bit element in `v`. 2461 int _mm_movemask_epi8 (__m128i a) pure @trusted 2462 { 2463 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 2464 static if (GDC_with_SSE2) 2465 { 2466 return __builtin_ia32_pmovmskb128(cast(ubyte16)a); 2467 } 2468 else static if (LDC_with_SSE2) 2469 { 2470 return __builtin_ia32_pmovmskb128(cast(byte16)a); 2471 } 2472 else static if (LDC_with_ARM64) 2473 { 2474 // Solution from https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon 2475 // The other two solutions lead to unfound intrinsics in LLVM and that took a long time. 2476 // SO there might be something a bit faster, but this one is reasonable and branchless. 2477 byte8 mask_shift; 2478 mask_shift.ptr[0] = 7; 2479 mask_shift.ptr[1] = 6; 2480 mask_shift.ptr[2] = 5; 2481 mask_shift.ptr[3] = 4; 2482 mask_shift.ptr[4] = 3; 2483 mask_shift.ptr[5] = 2; 2484 mask_shift.ptr[6] = 1; 2485 mask_shift.ptr[7] = 0; 2486 byte8 mask_and = byte8(-128); 2487 byte8 lo = vget_low_u8(cast(byte16)a); 2488 byte8 hi = vget_high_u8(cast(byte16)a); 2489 lo = vand_u8(lo, mask_and); 2490 lo = vshr_u8(lo, mask_shift); 2491 hi = vand_u8(hi, mask_and); 2492 hi = vshr_u8(hi, mask_shift); 2493 lo = vpadd_u8(lo,lo); 2494 lo = vpadd_u8(lo,lo); 2495 lo = vpadd_u8(lo,lo); 2496 hi = vpadd_u8(hi,hi); 2497 hi = vpadd_u8(hi,hi); 2498 hi = vpadd_u8(hi,hi); 2499 return (cast(ubyte)(hi[0]) << 8) | cast(ubyte)(lo[0]); 2500 } 2501 else 2502 { 2503 byte16 ai = cast(byte16)a; 2504 int r = 0; 2505 foreach(bit; 0..16) 2506 { 2507 if (ai.array[bit] < 0) r += (1 << bit); 2508 } 2509 return r; 2510 } 2511 } 2512 unittest 2513 { 2514 assert(0x9C36 == _mm_movemask_epi8(_mm_set_epi8(-1, 1, 2, -3, -1, -1, 4, 8, 127, 0, -1, -1, 0, -1, -1, 0))); 2515 } 2516 2517 /// Create mask from the most significant bit of each 16-bit element in `v`. #BONUS 2518 int _mm_movemask_epi16 (__m128i a) pure @trusted 2519 { 2520 return _mm_movemask_epi8(_mm_packs_epi16(a, _mm_setzero_si128())); 2521 } 2522 unittest 2523 { 2524 assert(0x9C == _mm_movemask_epi16(_mm_set_epi16(-1, 1, 2, -3, -32768, -1, 32767, 8))); 2525 } 2526 2527 /// Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) 2528 /// loating-point element in `v`. 2529 int _mm_movemask_pd(__m128d v) pure @safe 2530 { 2531 // PERF: Not possible in D_SIMD because of https://issues.dlang.org/show_bug.cgi?id=8047 2532 static if (GDC_with_SSE2) 2533 { 2534 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2535 /// packed double-precision (64-bit) floating-point element in `v`. 2536 return __builtin_ia32_movmskpd(v); 2537 } 2538 else static if (LDC_with_SSE2) 2539 { 2540 /// Set each bit of mask `dst` based on the most significant bit of the corresponding 2541 /// packed double-precision (64-bit) floating-point element in `v`. 
2542 return __builtin_ia32_movmskpd(v); 2543 } 2544 else 2545 { 2546 long2 lv = cast(long2)v; 2547 int r = 0; 2548 if (lv.array[0] < 0) r += 1; 2549 if (lv.array[1] < 0) r += 2; 2550 return r; 2551 } 2552 } 2553 unittest 2554 { 2555 __m128d A = cast(__m128d) _mm_set_epi64x(-1, 0); 2556 assert(_mm_movemask_pd(A) == 2); 2557 } 2558 2559 /// Copy the lower 64-bit integer in `v`. 2560 __m64 _mm_movepi64_pi64 (__m128i v) pure @safe 2561 { 2562 long2 lv = cast(long2)v; 2563 return long1(lv.array[0]); 2564 } 2565 unittest 2566 { 2567 __m128i A = _mm_set_epi64x(-1, -2); 2568 __m64 R = _mm_movepi64_pi64(A); 2569 assert(R.array[0] == -2); 2570 } 2571 2572 /// Copy the 64-bit integer `a` to the lower element of dest, and zero the upper element. 2573 __m128i _mm_movpi64_epi64 (__m64 a) pure @trusted 2574 { 2575 long2 r; 2576 r.ptr[0] = a.array[0]; 2577 r.ptr[1] = 0; 2578 return cast(__m128i)r; 2579 } 2580 2581 /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in `a` and `b`, 2582 /// and store the unsigned 64-bit results. 2583 __m128i _mm_mul_epu32 (__m128i a, __m128i b) pure @trusted 2584 { 2585 // PERF DMD D_SIMD 2586 static if (GDC_with_SSE2) 2587 { 2588 return cast(__m128i) __builtin_ia32_pmuludq128 (a, b); 2589 } 2590 else 2591 { 2592 version(LDC) 2593 { 2594 static if (__VERSION__ >= 2088) 2595 { 2596 // Need LLVM9 for proper optimization 2597 long2 la, lb; 2598 la.ptr[0] = cast(uint)a.array[0]; 2599 la.ptr[1] = cast(uint)a.array[2]; 2600 lb.ptr[0] = cast(uint)b.array[0]; 2601 lb.ptr[1] = cast(uint)b.array[2]; 2602 } 2603 else 2604 { 2605 __m128i zero; 2606 zero = 0; 2607 long2 la = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(a, zero); 2608 long2 lb = cast(long2) shufflevectorLDC!(int4, 0, 4, 2, 6)(b, zero); 2609 } 2610 } 2611 else 2612 { 2613 long2 la, lb; 2614 la.ptr[0] = cast(uint)a.array[0]; 2615 la.ptr[1] = cast(uint)a.array[2]; 2616 lb.ptr[0] = cast(uint)b.array[0]; 2617 lb.ptr[1] = cast(uint)b.array[2]; 2618 } 2619 2620 version(DigitalMars) 2621 { 2622 // DMD has no long2 mul 2623 la.ptr[0] *= lb.array[0]; 2624 la.ptr[1] *= lb.array[1]; 2625 return cast(__m128i)(la); 2626 } 2627 else 2628 { 2629 static if (__VERSION__ >= 2076) 2630 { 2631 return cast(__m128i)(la * lb); 2632 } 2633 else 2634 { 2635 // long2 mul not supported before LDC 1.5 2636 la.ptr[0] *= lb.array[0]; 2637 la.ptr[1] *= lb.array[1]; 2638 return cast(__m128i)(la); 2639 } 2640 } 2641 } 2642 } 2643 unittest 2644 { 2645 __m128i A = _mm_set_epi32(42, 0xDEADBEEF, 42, 0xffffffff); 2646 __m128i B = _mm_set_epi32(42, 0xCAFEBABE, 42, 0xffffffff); 2647 __m128i C = _mm_mul_epu32(A, B); 2648 long2 LC = cast(long2)C; 2649 assert(LC.array[0] == 18446744065119617025uL); 2650 assert(LC.array[1] == 12723420444339690338uL); 2651 } 2652 2653 /// Multiply packed double-precision (64-bit) floating-point elements in `a` and `b`, and return the results. 2654 __m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe 2655 { 2656 pragma(inline, true); 2657 return a * b; 2658 } 2659 unittest 2660 { 2661 __m128d a = [-2.0, 1.5]; 2662 a = _mm_mul_pd(a, a); 2663 assert(a.array == [4.0, 2.25]); 2664 } 2665 2666 /// Multiply the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower 2667 /// element of result, and copy the upper element from `a` to the upper element of result. 
2668 __m128d _mm_mul_sd(__m128d a, __m128d b) pure @trusted 2669 { 2670 version(DigitalMars) 2671 { 2672 // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599 2673 // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again 2674 asm pure nothrow @nogc @trusted { nop;} 2675 a.array[0] = a.array[0] * b.array[0]; 2676 return a; 2677 } 2678 else static if (GDC_with_SSE2) 2679 { 2680 return __builtin_ia32_mulsd(a, b); 2681 } 2682 else 2683 { 2684 a.ptr[0] *= b.array[0]; 2685 return a; 2686 } 2687 } 2688 unittest 2689 { 2690 __m128d a = [-2.0, 1.5]; 2691 a = _mm_mul_sd(a, a); 2692 assert(a.array == [4.0, 1.5]); 2693 } 2694 2695 /// Multiply the low unsigned 32-bit integers from `a` and `b`, 2696 /// and get an unsigned 64-bit result. 2697 __m64 _mm_mul_su32 (__m64 a, __m64 b) pure @safe 2698 { 2699 return to_m64(_mm_mul_epu32(to_m128i(a), to_m128i(b))); 2700 } 2701 unittest 2702 { 2703 __m64 A = _mm_set_pi32(42, 0xDEADBEEF); 2704 __m64 B = _mm_set_pi32(42, 0xCAFEBABE); 2705 __m64 C = _mm_mul_su32(A, B); 2706 assert(C.array[0] == 0xDEADBEEFuL * 0xCAFEBABEuL); 2707 } 2708 2709 /// Multiply the packed signed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2710 /// high 16 bits of the intermediate integers. 2711 __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) pure @trusted 2712 { 2713 static if (GDC_with_SSE2) 2714 { 2715 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2716 } 2717 else static if (LDC_with_SSE2) 2718 { 2719 return cast(__m128i) __builtin_ia32_pmulhw128(cast(short8)a, cast(short8)b); 2720 } 2721 else 2722 { 2723 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x smull.4s and shrn.4h shrn2.8h 2724 // PERF: it seems the simde solution has one less instruction in ARM64. 2725 // PERF: Catastrophic in ARM32. 2726 short8 sa = cast(short8)a; 2727 short8 sb = cast(short8)b; 2728 short8 r = void; 2729 r.ptr[0] = (sa.array[0] * sb.array[0]) >> 16; 2730 r.ptr[1] = (sa.array[1] * sb.array[1]) >> 16; 2731 r.ptr[2] = (sa.array[2] * sb.array[2]) >> 16; 2732 r.ptr[3] = (sa.array[3] * sb.array[3]) >> 16; 2733 r.ptr[4] = (sa.array[4] * sb.array[4]) >> 16; 2734 r.ptr[5] = (sa.array[5] * sb.array[5]) >> 16; 2735 r.ptr[6] = (sa.array[6] * sb.array[6]) >> 16; 2736 r.ptr[7] = (sa.array[7] * sb.array[7]) >> 16; 2737 return cast(__m128i)r; 2738 } 2739 } 2740 unittest 2741 { 2742 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2743 __m128i B = _mm_set1_epi16(16384); 2744 short8 R = cast(short8)_mm_mulhi_epi16(A, B); 2745 short[8] correct = [0, -4, 0, 0, 1, 2, 4, 1]; 2746 assert(R.array == correct); 2747 } 2748 2749 /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the 2750 /// high 16 bits of the intermediate integers. 2751 __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) pure @trusted 2752 { 2753 static if (GDC_with_SSE2) 2754 { 2755 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2756 } 2757 else static if (LDC_with_SSE2) 2758 { 2759 return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8)a, cast(short8)b); 2760 } 2761 else 2762 { 2763 // ARM64: LDC 1.5 -O2 or later gives a nice sequence with 2 x ext.16b, 2 x umull.4s and shrn.4h shrn2.8h 2764 // it seems the simde solution has one less instruction in ARM64 2765 // PERF: Catastrophic in ARM32. 
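        // Each lane below is the high half of a 16x16 -> 32-bit unsigned multiply:
        // the operands are zero-extended through ushort casts and bits 16..31 of the
        // product are kept.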
2766 short8 sa = cast(short8)a; 2767 short8 sb = cast(short8)b; 2768 short8 r = void; 2769 r.ptr[0] = cast(short)( (cast(ushort)sa.array[0] * cast(ushort)sb.array[0]) >> 16 ); 2770 r.ptr[1] = cast(short)( (cast(ushort)sa.array[1] * cast(ushort)sb.array[1]) >> 16 ); 2771 r.ptr[2] = cast(short)( (cast(ushort)sa.array[2] * cast(ushort)sb.array[2]) >> 16 ); 2772 r.ptr[3] = cast(short)( (cast(ushort)sa.array[3] * cast(ushort)sb.array[3]) >> 16 ); 2773 r.ptr[4] = cast(short)( (cast(ushort)sa.array[4] * cast(ushort)sb.array[4]) >> 16 ); 2774 r.ptr[5] = cast(short)( (cast(ushort)sa.array[5] * cast(ushort)sb.array[5]) >> 16 ); 2775 r.ptr[6] = cast(short)( (cast(ushort)sa.array[6] * cast(ushort)sb.array[6]) >> 16 ); 2776 r.ptr[7] = cast(short)( (cast(ushort)sa.array[7] * cast(ushort)sb.array[7]) >> 16 ); 2777 return cast(__m128i)r; 2778 } 2779 } 2780 unittest 2781 { 2782 __m128i A = _mm_setr_epi16(0, -16, 2, 3, 4, 8, 16, 7); 2783 __m128i B = _mm_set1_epi16(16384); 2784 short8 R = cast(short8)_mm_mulhi_epu16(A, B); 2785 short[8] correct = [0, 0x3FFC, 0, 0, 1, 2, 4, 1]; 2786 assert(R.array == correct); 2787 } 2788 2789 /// Multiply the packed 16-bit integers in `a` and `b`, producing intermediate 32-bit integers, and return the low 16 2790 /// bits of the intermediate integers. 2791 __m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe 2792 { 2793 return cast(__m128i)(cast(short8)a * cast(short8)b); 2794 } 2795 unittest 2796 { 2797 __m128i A = _mm_setr_epi16(16384, -16, 0, 3, 4, 1, 16, 7); 2798 __m128i B = _mm_set1_epi16(16384); 2799 short8 R = cast(short8)_mm_mullo_epi16(A, B); 2800 short[8] correct = [0, 0, 0, -16384, 0, 16384, 0, -16384]; 2801 assert(R.array == correct); 2802 } 2803 2804 /// Compute the bitwise NOT of 128 bits in `a`. #BONUS 2805 __m128i _mm_not_si128 (__m128i a) pure @safe 2806 { 2807 return ~a; 2808 } 2809 unittest 2810 { 2811 __m128i A = _mm_set1_epi32(-748); 2812 int4 notA = cast(int4) _mm_not_si128(A); 2813 int[4] correct = [747, 747, 747, 747]; 2814 assert(notA.array == correct); 2815 } 2816 2817 /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 2818 __m128d _mm_or_pd (__m128d a, __m128d b) pure @safe 2819 { 2820 pragma(inline, true); 2821 return cast(__m128d)( cast(__m128i)a | cast(__m128i)b ); 2822 } 2823 2824 /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and `b`. 2825 __m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe 2826 { 2827 pragma(inline, true); 2828 return a | b; 2829 } 2830 2831 /// Convert packed signed 32-bit integers from `a` and `b` to packed 16-bit integers using signed saturation. 
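// Signed saturation clamps each 32-bit value to the short range before narrowing:
// anything above 32767 becomes 32767 and anything below -32768 becomes -32768
// (e.g. 100000 -> 32767, -100000 -> -32768, as in the unittest below). The low four
// lanes of the result come from `a`, the high four from `b`.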
2832 __m128i _mm_packs_epi32 (__m128i a, __m128i b) pure @trusted 2833 { 2834 static if (GDC_with_SSE2) 2835 { 2836 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2837 } 2838 else static if (LDC_with_SSE2) 2839 { 2840 return cast(__m128i) __builtin_ia32_packssdw128(a, b); 2841 } 2842 else static if (LDC_with_ARM64) 2843 { 2844 short4 ra = vqmovn_s32(cast(int4)a); 2845 short4 rb = vqmovn_s32(cast(int4)b); 2846 return cast(__m128i)vcombine_s16(ra, rb); 2847 } 2848 else 2849 { 2850 // PERF: catastrophic on ARM32 2851 short8 r; 2852 r.ptr[0] = saturateSignedIntToSignedShort(a.array[0]); 2853 r.ptr[1] = saturateSignedIntToSignedShort(a.array[1]); 2854 r.ptr[2] = saturateSignedIntToSignedShort(a.array[2]); 2855 r.ptr[3] = saturateSignedIntToSignedShort(a.array[3]); 2856 r.ptr[4] = saturateSignedIntToSignedShort(b.array[0]); 2857 r.ptr[5] = saturateSignedIntToSignedShort(b.array[1]); 2858 r.ptr[6] = saturateSignedIntToSignedShort(b.array[2]); 2859 r.ptr[7] = saturateSignedIntToSignedShort(b.array[3]); 2860 return cast(__m128i)r; 2861 } 2862 } 2863 unittest 2864 { 2865 __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); 2866 short8 R = cast(short8) _mm_packs_epi32(A, A); 2867 short[8] correct = [32767, -32768, 1000, 0, 32767, -32768, 1000, 0]; 2868 assert(R.array == correct); 2869 } 2870 2871 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using signed saturation. 2872 __m128i _mm_packs_epi16 (__m128i a, __m128i b) pure @trusted 2873 { 2874 static if (GDC_with_SSE2) 2875 { 2876 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2877 } 2878 else static if (LDC_with_SSE2) 2879 { 2880 return cast(__m128i) __builtin_ia32_packsswb128(cast(short8)a, cast(short8)b); 2881 } 2882 else static if (LDC_with_ARM64) 2883 { 2884 // generate a nice pair of sqxtn.8b + sqxtn2 since LDC 1.5 -02 2885 byte8 ra = vqmovn_s16(cast(short8)a); 2886 byte8 rb = vqmovn_s16(cast(short8)b); 2887 return cast(__m128i)vcombine_s8(ra, rb); 2888 } 2889 else 2890 { 2891 // PERF: ARM32 is missing 2892 byte16 r; 2893 short8 sa = cast(short8)a; 2894 short8 sb = cast(short8)b; 2895 foreach(i; 0..8) 2896 r.ptr[i] = saturateSignedWordToSignedByte(sa.array[i]); 2897 foreach(i; 0..8) 2898 r.ptr[i+8] = saturateSignedWordToSignedByte(sb.array[i]); 2899 return cast(__m128i)r; 2900 } 2901 } 2902 unittest 2903 { 2904 __m128i A = _mm_setr_epi16(1000, -1000, 1000, 0, 256, -129, 254, 0); 2905 byte16 R = cast(byte16) _mm_packs_epi16(A, A); 2906 byte[16] correct = [127, -128, 127, 0, 127, -128, 127, 0, 2907 127, -128, 127, 0, 127, -128, 127, 0]; 2908 assert(R.array == correct); 2909 } 2910 2911 /// Convert packed signed 16-bit integers from `a` and `b` to packed 8-bit integers using unsigned saturation. 
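// Unsigned saturation clamps each signed 16-bit value to 0..255 before narrowing:
// negative inputs become 0 and values above 255 become 255 (so -10 -> 0 and
// 400 -> 255 in the unittest below). The low eight bytes come from `a`, the high
// eight from `b`.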
2912 __m128i _mm_packus_epi16 (__m128i a, __m128i b) pure @trusted 2913 { 2914 // PERF DMD catastrophic 2915 static if (GDC_with_SSE2) 2916 { 2917 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2918 } 2919 else static if (LDC_with_SSE2) 2920 { 2921 return cast(__m128i) __builtin_ia32_packuswb128(cast(short8)a, cast(short8)b); 2922 } 2923 else static if (LDC_with_ARM64) 2924 { 2925 // generate a nice pair of sqxtun + sqxtun2 since LDC 1.5 -02 2926 byte8 ra = vqmovun_s16(cast(short8)a); 2927 byte8 rb = vqmovun_s16(cast(short8)b); 2928 return cast(__m128i)vcombine_s8(ra, rb); 2929 } 2930 else 2931 { 2932 short8 sa = cast(short8)a; 2933 short8 sb = cast(short8)b; 2934 align(16) ubyte[16] result = void; 2935 for (int i = 0; i < 8; ++i) 2936 { 2937 short s = sa[i]; 2938 if (s < 0) s = 0; 2939 if (s > 255) s = 255; 2940 result[i] = cast(ubyte)s; 2941 2942 s = sb[i]; 2943 if (s < 0) s = 0; 2944 if (s > 255) s = 255; 2945 result[i+8] = cast(ubyte)s; 2946 } 2947 return *cast(__m128i*)(result.ptr); 2948 } 2949 } 2950 unittest 2951 { 2952 __m128i A = _mm_setr_epi16(-10, 400, 0, 256, 255, 2, 1, 0); 2953 byte16 AA = cast(byte16) _mm_packus_epi16(A, A); 2954 static immutable ubyte[16] correctResult = [0, 255, 0, 255, 255, 2, 1, 0, 2955 0, 255, 0, 255, 255, 2, 1, 0]; 2956 foreach(i; 0..16) 2957 assert(AA.array[i] == cast(byte)(correctResult[i])); 2958 } 2959 2960 /// Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance 2961 /// and power consumption of spin-wait loops. 2962 void _mm_pause() @trusted 2963 { 2964 version(GNU) 2965 { 2966 static if (GDC_with_SSE2) 2967 { 2968 __builtin_ia32_pause(); 2969 } 2970 else version(X86) 2971 { 2972 asm pure nothrow @nogc @trusted 2973 { 2974 "pause;\n" : : : ; 2975 } 2976 } 2977 else 2978 static assert(false); 2979 } 2980 else static if (LDC_with_SSE2) 2981 { 2982 __builtin_ia32_pause(); 2983 } 2984 else static if (DMD_with_asm) 2985 { 2986 asm nothrow @nogc pure @safe 2987 { 2988 rep; nop; // F3 90 = pause 2989 } 2990 } 2991 else version (LDC) 2992 { 2993 // PERF: Do nothing currently , could be the "yield" intruction on ARM. 2994 } 2995 else 2996 static assert(false); 2997 } 2998 unittest 2999 { 3000 _mm_pause(); 3001 } 3002 3003 /// Compute the absolute differences of packed unsigned 8-bit integers in `a` and `b`, then horizontally sum each 3004 /// consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the 3005 /// low 16 bits of 64-bit elements in result. 
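// Viewed as an int4, only elements 0 and 2 of the result are non-zero: element 0
// holds the sum of |a[i] - b[i]| over the first eight bytes, element 2 the sum over
// the last eight, which is what the prime-sum unittest below checks.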
3006 __m128i _mm_sad_epu8 (__m128i a, __m128i b) pure @trusted 3007 { 3008 static if (GDC_with_SSE2) 3009 { 3010 return cast(__m128i) __builtin_ia32_psadbw128(cast(ubyte16)a, cast(ubyte16)b); 3011 } 3012 else static if (LDC_with_SSE2) 3013 { 3014 return cast(__m128i) __builtin_ia32_psadbw128(cast(byte16)a, cast(byte16)b); 3015 } 3016 else static if (LDC_with_ARM64) 3017 { 3018 ushort8 t = cast(ushort8) vpaddlq_u8(vabdq_u8(cast(byte16) a, cast(byte16) b)); 3019 3020 // PERF: Looks suboptimal vs addp 3021 ushort r0 = cast(ushort)(t[0] + t[1] + t[2] + t[3]); 3022 ushort r4 = cast(ushort)(t[4] + t[5] + t[6] + t[7]); 3023 ushort8 r = 0; 3024 r[0] = r0; 3025 r[4] = r4; 3026 return cast(__m128i) r; 3027 } 3028 else 3029 { 3030 // PERF: ARM32 is lacking 3031 byte16 ab = cast(byte16)a; 3032 byte16 bb = cast(byte16)b; 3033 ubyte[16] t; 3034 foreach(i; 0..16) 3035 { 3036 int diff = cast(ubyte)(ab.array[i]) - cast(ubyte)(bb.array[i]); 3037 if (diff < 0) diff = -diff; 3038 t[i] = cast(ubyte)(diff); 3039 } 3040 int4 r = _mm_setzero_si128(); 3041 r.ptr[0] = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 3042 r.ptr[2] = t[8] + t[9] + t[10]+ t[11]+ t[12]+ t[13]+ t[14]+ t[15]; 3043 return r; 3044 } 3045 } 3046 unittest 3047 { 3048 __m128i A = _mm_setr_epi8(3, 4, 6, 8, 12, 14, 18, 20, 24, 30, 32, 38, 42, 44, 48, 54); // primes + 1 3049 __m128i B = _mm_set1_epi8(1); 3050 __m128i R = _mm_sad_epu8(A, B); 3051 int[4] correct = [2 + 3 + 5 + 7 + 11 + 13 + 17 + 19, 3052 0, 3053 23 + 29 + 31 + 37 + 41 + 43 + 47 + 53, 3054 0]; 3055 assert(R.array == correct); 3056 } 3057 3058 /// Set packed 16-bit integers with the supplied values. 3059 __m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted 3060 { 3061 short8 r = void; 3062 r.ptr[0] = e0; 3063 r.ptr[1] = e1; 3064 r.ptr[2] = e2; 3065 r.ptr[3] = e3; 3066 r.ptr[4] = e4; 3067 r.ptr[5] = e5; 3068 r.ptr[6] = e6; 3069 r.ptr[7] = e7; 3070 return cast(__m128i) r; 3071 } 3072 unittest 3073 { 3074 __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 3075 short8 B = cast(short8) A; 3076 foreach(i; 0..8) 3077 assert(B.array[i] == i); 3078 } 3079 3080 /// Set packed 32-bit integers with the supplied values. 3081 __m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3082 { 3083 // PERF: does a constant inline correctly? vs int4 field assignment 3084 align(16) int[4] r = [e0, e1, e2, e3]; 3085 return *cast(int4*)&r; 3086 } 3087 unittest 3088 { 3089 __m128i A = _mm_set_epi32(3, 2, 1, 0); 3090 foreach(i; 0..4) 3091 assert(A.array[i] == i); 3092 } 3093 3094 /// Set packed 64-bit integers with the supplied values. 3095 __m128i _mm_set_epi64(__m64 e1, __m64 e0) pure @trusted 3096 { 3097 pragma(inline, true); 3098 long2 r = void; 3099 r.ptr[0] = e0.array[0]; 3100 r.ptr[1] = e1.array[0]; 3101 return cast(__m128i)(r); 3102 } 3103 unittest 3104 { 3105 __m128i A = _mm_set_epi64(_mm_cvtsi64_m64(1234), _mm_cvtsi64_m64(5678)); 3106 long2 B = cast(long2) A; 3107 assert(B.array[0] == 5678); 3108 assert(B.array[1] == 1234); 3109 } 3110 3111 /// Set packed 64-bit integers with the supplied values. 
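// Arguments are listed from the highest lane down to the lowest, so `e1` becomes
// element 1 and `e0` element 0. The `x` suffix distinguishes this overload, which
// takes `long` values, from `_mm_set_epi64` above, which takes `__m64` operands.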
3112 __m128i _mm_set_epi64x (long e1, long e0) pure @trusted 3113 { 3114 pragma(inline, true); 3115 long2 r = void; 3116 r.ptr[0] = e0; 3117 r.ptr[1] = e1; 3118 return cast(__m128i)(r); 3119 } 3120 unittest 3121 { 3122 __m128i A = _mm_set_epi64x(1234, -5678); 3123 long2 B = cast(long2) A; 3124 assert(B.array[0] == -5678); 3125 assert(B.array[1] == 1234); 3126 } 3127 3128 /// Set packed 8-bit integers with the supplied values. 3129 __m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12, 3130 byte e11, byte e10, byte e9, byte e8, 3131 byte e7, byte e6, byte e5, byte e4, 3132 byte e3, byte e2, byte e1, byte e0) pure @trusted 3133 { 3134 align(16) byte[16] result = [e0, e1, e2, e3, e4, e5, e6, e7, 3135 e8, e9, e10, e11, e12, e13, e14, e15]; 3136 return *cast(__m128i*)(result.ptr); 3137 } 3138 unittest 3139 { 3140 byte16 R = cast(byte16) _mm_set_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); 3141 byte[16] correct = [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, -128, 127, 56, 0, -1]; 3142 assert(R.array == correct); 3143 } 3144 3145 /// Set packed double-precision (64-bit) floating-point elements with the supplied values. 3146 __m128d _mm_set_pd (double e1, double e0) pure @trusted 3147 { 3148 pragma(inline, true); 3149 double2 r = void; 3150 r.ptr[0] = e0; 3151 r.ptr[1] = e1; 3152 return r; 3153 } 3154 unittest 3155 { 3156 __m128d A = _mm_set_pd(61.0, 55.0); 3157 double[2] correct = [55.0, 61.0]; 3158 assert(A.array == correct); 3159 } 3160 3161 /// Broadcast double-precision (64-bit) floating-point value `a` to all element. 3162 __m128d _mm_set_pd1 (double a) pure @trusted 3163 { 3164 pragma(inline, true); 3165 __m128d r = void; 3166 r.ptr[0] = a; 3167 r.ptr[1] = a; 3168 return r; 3169 } 3170 unittest 3171 { 3172 __m128d A = _mm_set_pd1(61.0); 3173 double[2] correct = [61.0, 61.0]; 3174 assert(A.array == correct); 3175 } 3176 3177 /// Copy double-precision (64-bit) floating-point element `a` to the lower element of result, 3178 /// and zero the upper element. 3179 __m128d _mm_set_sd (double a) pure @trusted 3180 { 3181 double2 r = void; 3182 r.ptr[0] = a; 3183 r.ptr[1] = 0.0; 3184 return r; 3185 } 3186 unittest 3187 { 3188 __m128d A = _mm_set_sd(61.0); 3189 double[2] correct = [61.0, 0.0]; 3190 assert(A.array == correct); 3191 } 3192 3193 /// Broadcast 16-bit integer a to all elements of dst. 3194 __m128i _mm_set1_epi16 (short a) pure @trusted 3195 { 3196 version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469 3197 { 3198 short8 v = a; 3199 return cast(__m128i) v; 3200 } 3201 else 3202 { 3203 pragma(inline, true); 3204 return cast(__m128i)(short8(a)); 3205 } 3206 } 3207 unittest 3208 { 3209 short8 a = cast(short8) _mm_set1_epi16(31); 3210 for (int i = 0; i < 8; ++i) 3211 assert(a.array[i] == 31); 3212 } 3213 3214 /// Broadcast 32-bit integer `a` to all elements. 3215 __m128i _mm_set1_epi32 (int a) pure @trusted 3216 { 3217 pragma(inline, true); 3218 return cast(__m128i)(int4(a)); 3219 } 3220 unittest 3221 { 3222 int4 a = cast(int4) _mm_set1_epi32(31); 3223 for (int i = 0; i < 4; ++i) 3224 assert(a.array[i] == 31); 3225 } 3226 3227 /// Broadcast 64-bit integer `a` to all elements. 
3228 __m128i _mm_set1_epi64 (__m64 a) pure @safe 3229 { 3230 return _mm_set_epi64(a, a); 3231 } 3232 unittest 3233 { 3234 long b = 0x1DEADCAFE; 3235 __m64 a; 3236 a.ptr[0] = b; 3237 long2 c = cast(long2) _mm_set1_epi64(a); 3238 assert(c.array[0] == b); 3239 assert(c.array[1] == b); 3240 } 3241 3242 /// Broadcast 64-bit integer `a` to all elements 3243 __m128i _mm_set1_epi64x (long a) pure @trusted 3244 { 3245 long2 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3246 return cast(__m128i)(b); 3247 } 3248 unittest 3249 { 3250 long b = 0x1DEADCAFE; 3251 long2 c = cast(long2) _mm_set1_epi64x(b); 3252 for (int i = 0; i < 2; ++i) 3253 assert(c.array[i] == b); 3254 } 3255 3256 /// Broadcast 8-bit integer `a` to all elements. 3257 __m128i _mm_set1_epi8 (byte a) pure @trusted 3258 { 3259 pragma(inline, true); 3260 byte16 b = a; // Must be on its own line to workaround https://issues.dlang.org/show_bug.cgi?id=21470 3261 return cast(__m128i)(b); 3262 } 3263 unittest 3264 { 3265 byte16 b = cast(byte16) _mm_set1_epi8(31); 3266 for (int i = 0; i < 16; ++i) 3267 assert(b.array[i] == 31); 3268 } 3269 3270 alias _mm_set1_pd = _mm_set_pd1; 3271 3272 /// Set packed 16-bit integers with the supplied values in reverse order. 3273 __m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, 3274 short e3, short e2, short e1, short e0) pure @trusted 3275 { 3276 short8 r = void; 3277 r.ptr[0] = e7; 3278 r.ptr[1] = e6; 3279 r.ptr[2] = e5; 3280 r.ptr[3] = e4; 3281 r.ptr[4] = e3; 3282 r.ptr[5] = e2; 3283 r.ptr[6] = e1; 3284 r.ptr[7] = e0; 3285 return cast(__m128i)(r); 3286 } 3287 unittest 3288 { 3289 short8 A = cast(short8) _mm_setr_epi16(7, 6, 5, -32768, 32767, 2, 1, 0); 3290 short[8] correct = [7, 6, 5, -32768, 32767, 2, 1, 0]; 3291 assert(A.array == correct); 3292 } 3293 3294 /// Set packed 32-bit integers with the supplied values in reverse order. 3295 __m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted 3296 { 3297 // Performs better than = void; with GDC 3298 pragma(inline, true); 3299 align(16) int[4] result = [e3, e2, e1, e0]; 3300 return *cast(__m128i*)(result.ptr); 3301 } 3302 unittest 3303 { 3304 int4 A = cast(int4) _mm_setr_epi32(-1, 0, -2147483648, 2147483647); 3305 int[4] correct = [-1, 0, -2147483648, 2147483647]; 3306 assert(A.array == correct); 3307 } 3308 3309 /// Set packed 64-bit integers with the supplied values in reverse order. 3310 __m128i _mm_setr_epi64 (long e1, long e0) pure @trusted 3311 { 3312 long2 r = void; 3313 r.ptr[0] = e1; 3314 r.ptr[1] = e0; 3315 return cast(__m128i)(r); 3316 } 3317 unittest 3318 { 3319 long2 A = cast(long2) _mm_setr_epi64(-1, 0); 3320 long[2] correct = [-1, 0]; 3321 assert(A.array == correct); 3322 } 3323 3324 /// Set packed 8-bit integers with the supplied values in reverse order. 
3325 __m128i _mm_setr_epi8 (byte e15, byte e14, byte e13, byte e12, 3326 byte e11, byte e10, byte e9, byte e8, 3327 byte e7, byte e6, byte e5, byte e4, 3328 byte e3, byte e2, byte e1, byte e0) pure @trusted 3329 { 3330 align(16) byte[16] result = [e15, e14, e13, e12, e11, e10, e9, e8, 3331 e7, e6, e5, e4, e3, e2, e1, e0]; 3332 return *cast(__m128i*)(result.ptr); 3333 } 3334 unittest 3335 { 3336 byte16 R = cast(byte16) _mm_setr_epi8(-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); 3337 byte[16] correct = [-1, 0, 56, 127, -128, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]; 3338 assert(R.array == correct); 3339 } 3340 3341 /// Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order. 3342 __m128d _mm_setr_pd (double e1, double e0) pure @trusted 3343 { 3344 pragma(inline, true); 3345 double2 result; 3346 result.ptr[0] = e1; 3347 result.ptr[1] = e0; 3348 return result; 3349 } 3350 unittest 3351 { 3352 __m128d A = _mm_setr_pd(61.0, 55.0); 3353 double[2] correct = [61.0, 55.0]; 3354 assert(A.array == correct); 3355 } 3356 3357 /// Return vector of type `__m128d` with all elements set to zero. 3358 __m128d _mm_setzero_pd() pure @trusted 3359 { 3360 pragma(inline, true); 3361 double2 r = void; 3362 r.ptr[0] = 0.0; 3363 r.ptr[1] = 0.0; 3364 return r; 3365 } 3366 unittest 3367 { 3368 __m128d A = _mm_setzero_pd(); 3369 double[2] correct = [0.0, 0.0]; 3370 assert(A.array == correct); 3371 } 3372 3373 /// Return vector of type `__m128i` with all elements set to zero. 3374 __m128i _mm_setzero_si128() pure @trusted 3375 { 3376 pragma(inline, true); 3377 int4 r = void; 3378 r.ptr[0] = 0; 3379 r.ptr[1] = 0; 3380 r.ptr[2] = 0; 3381 r.ptr[3] = 0; 3382 return r; 3383 } 3384 unittest 3385 { 3386 __m128i A = _mm_setzero_si128(); 3387 int[4] correct = [0, 0, 0, 0]; 3388 assert(A.array == correct); 3389 } 3390 3391 /// Shuffle 32-bit integers in `a` using the control in `imm8`. 3392 /// See_also: `_MM_SHUFFLE`. 3393 __m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @trusted 3394 { 3395 // PERF DMD D_SIMD 3396 static if (GDC_with_SSE2) 3397 { 3398 return __builtin_ia32_pshufd(a, imm8); 3399 } 3400 else version(LDC) 3401 { 3402 return shufflevectorLDC!(int4, (imm8 >> 0) & 3, 3403 (imm8 >> 2) & 3, 3404 (imm8 >> 4) & 3, 3405 (imm8 >> 6) & 3)(a, a); 3406 } 3407 else 3408 { 3409 int4 r = void; 3410 r.ptr[0] = a.ptr[(imm8 >> 0) & 3]; 3411 r.ptr[1] = a.ptr[(imm8 >> 2) & 3]; 3412 r.ptr[2] = a.ptr[(imm8 >> 4) & 3]; 3413 r.ptr[3] = a.ptr[(imm8 >> 6) & 3]; 3414 return r; 3415 } 3416 } 3417 unittest 3418 { 3419 __m128i A = _mm_setr_epi32(0, 1, 2, 3); 3420 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3421 int4 B = cast(int4) _mm_shuffle_epi32!SHUFFLE(A); 3422 int[4] expectedB = [ 3, 2, 1, 0 ]; 3423 assert(B.array == expectedB); 3424 } 3425 3426 /// Shuffle double-precision (64-bit) floating-point elements using the control in `imm8`. 3427 /// See_also: `_MM_SHUFFLE2`. 
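// Control encoding: bit 0 of `imm8` selects which element of `a` goes to the low lane
// of the result, bit 1 selects which element of `b` goes to the high lane. A small
// sketch mirroring the unittest below:
//
//     enum int SHUF = _MM_SHUFFLE2(1, 1);
//     __m128d r = _mm_shuffle_pd!SHUF(a, b); // r == [ a[1], b[1] ]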
3428 __m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @trusted 3429 { 3430 // PERF DMD D_SIMD 3431 static if (GDC_with_SSE2) 3432 { 3433 return __builtin_ia32_shufpd(a, b, imm8); 3434 } 3435 else version(LDC) 3436 { 3437 return shufflevectorLDC!(double2, 0 + ( imm8 & 1 ), 3438 2 + ( (imm8 >> 1) & 1 ))(a, b); 3439 } 3440 else 3441 { 3442 double2 r = void; 3443 r.ptr[0] = a.array[imm8 & 1]; 3444 r.ptr[1] = b.array[(imm8 >> 1) & 1]; 3445 return r; 3446 } 3447 } 3448 unittest 3449 { 3450 __m128d A = _mm_setr_pd(0.5, 2.0); 3451 __m128d B = _mm_setr_pd(4.0, 5.0); 3452 enum int SHUFFLE = _MM_SHUFFLE2(1, 1); 3453 __m128d R = _mm_shuffle_pd!SHUFFLE(A, B); 3454 double[2] correct = [ 2.0, 5.0 ]; 3455 assert(R.array == correct); 3456 } 3457 3458 /// Shuffle 16-bit integers in the high 64 bits of `a` using the control in `imm8`. Store the results in the high 3459 /// 64 bits of result, with the low 64 bits being copied from from `a` to result. 3460 /// See also: `_MM_SHUFFLE`. 3461 __m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @trusted 3462 { 3463 // PERF DMD D_SIMD 3464 static if (GDC_with_SSE2) 3465 { 3466 return cast(__m128i) __builtin_ia32_pshufhw(cast(short8)a, imm8); 3467 } 3468 else version(LDC) 3469 { 3470 return cast(__m128i) shufflevectorLDC!(short8, 0, 1, 2, 3, 3471 4 + ( (imm8 >> 0) & 3 ), 3472 4 + ( (imm8 >> 2) & 3 ), 3473 4 + ( (imm8 >> 4) & 3 ), 3474 4 + ( (imm8 >> 6) & 3 ))(cast(short8)a, cast(short8)a); 3475 } 3476 else 3477 { 3478 short8 r = cast(short8)a; 3479 short8 sa = cast(short8)a; 3480 r.ptr[4] = sa.array[4 + ( (imm8 >> 0) & 3 ) ]; 3481 r.ptr[5] = sa.array[4 + ( (imm8 >> 2) & 3 ) ]; 3482 r.ptr[6] = sa.array[4 + ( (imm8 >> 4) & 3 ) ]; 3483 r.ptr[7] = sa.array[4 + ( (imm8 >> 6) & 3 ) ]; 3484 return cast(__m128i) r; 3485 } 3486 } 3487 unittest 3488 { 3489 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3490 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3491 short8 C = cast(short8) _mm_shufflehi_epi16!SHUFFLE(A); 3492 short[8] expectedC = [ 0, 1, 2, 3, 7, 6, 5, 4 ]; 3493 assert(C.array == expectedC); 3494 } 3495 3496 /// Shuffle 16-bit integers in the low 64 bits of `a` using the control in `imm8`. Store the results in the low 64 3497 /// bits of result, with the high 64 bits being copied from from `a` to result. 3498 /// See_also: `_MM_SHUFFLE`. 3499 __m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @trusted 3500 { 3501 // PERF DMD D_SIMD 3502 static if (GDC_with_SSE2) 3503 { 3504 return cast(__m128i) __builtin_ia32_pshuflw(cast(short8)a, imm8); 3505 } 3506 else version(LDC) 3507 { 3508 return cast(__m128i) shufflevectorLDC!(short8, ( (imm8 >> 0) & 3 ), 3509 ( (imm8 >> 2) & 3 ), 3510 ( (imm8 >> 4) & 3 ), 3511 ( (imm8 >> 6) & 3 ), 4, 5, 6, 7)(cast(short8)a, cast(short8)a); 3512 } 3513 else 3514 { 3515 short8 r = cast(short8)a; 3516 short8 sa = cast(short8)a; 3517 r.ptr[0] = sa.array[(imm8 >> 0) & 3]; 3518 r.ptr[1] = sa.array[(imm8 >> 2) & 3]; 3519 r.ptr[2] = sa.array[(imm8 >> 4) & 3]; 3520 r.ptr[3] = sa.array[(imm8 >> 6) & 3]; 3521 return cast(__m128i) r; 3522 } 3523 } 3524 unittest 3525 { 3526 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3527 enum int SHUFFLE = _MM_SHUFFLE(0, 1, 2, 3); 3528 short8 B = cast(short8) _mm_shufflelo_epi16!SHUFFLE(A); 3529 short[8] expectedB = [ 3, 2, 1, 0, 4, 5, 6, 7 ]; 3530 assert(B.array == expectedB); 3531 } 3532 3533 /// Shift packed 32-bit integers in `a` left by `count` while shifting in zeros. 
3534 deprecated("Use _mm_slli_epi32 instead.") __m128i _mm_sll_epi32 (__m128i a, __m128i count) pure @trusted 3535 { 3536 static if (LDC_with_SSE2) 3537 { 3538 return __builtin_ia32_pslld128(a, count); 3539 } 3540 else static if (GDC_with_SSE2) 3541 { 3542 return __builtin_ia32_pslld128(a, count); 3543 } 3544 else static if (DMD_with_32bit_asm) 3545 { 3546 asm pure nothrow @nogc @trusted 3547 { 3548 movdqu XMM0, a; 3549 movdqu XMM1, count; 3550 pslld XMM0, XMM1; 3551 movdqu a, XMM0; 3552 } 3553 return a; 3554 } 3555 else 3556 { 3557 int4 r = void; 3558 long2 lc = cast(long2)count; 3559 int bits = cast(int)(lc.array[0]); 3560 foreach(i; 0..4) 3561 r[i] = cast(uint)(a[i]) << bits; 3562 return r; 3563 } 3564 } 3565 3566 /// Shift packed 64-bit integers in `a` left by `count` while shifting in zeros. 3567 deprecated("Use _mm_slli_epi64 instead.") __m128i _mm_sll_epi64 (__m128i a, __m128i count) pure @trusted 3568 { 3569 static if (LDC_with_SSE2) 3570 { 3571 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); 3572 } 3573 else static if (GDC_with_SSE2) 3574 { 3575 return cast(__m128i) __builtin_ia32_psllq128(cast(long2)a, cast(long2)count); 3576 } 3577 else static if (DMD_with_32bit_asm) 3578 { 3579 asm pure nothrow @nogc @trusted 3580 { 3581 movdqu XMM0, a; 3582 movdqu XMM1, count; 3583 psllq XMM0, XMM1; 3584 movdqu a, XMM0; 3585 } 3586 return a; 3587 } 3588 else 3589 { 3590 // ARM: good since LDC 1.12 -O2 3591 // ~but -O0 version is catastrophic 3592 long2 r = void; 3593 long2 sa = cast(long2)a; 3594 long2 lc = cast(long2)count; 3595 int bits = cast(int)(lc.array[0]); 3596 foreach(i; 0..2) 3597 r.array[i] = cast(ulong)(sa.array[i]) << bits; 3598 return cast(__m128i)r; 3599 } 3600 } 3601 3602 /// Shift packed 16-bit integers in `a` left by `count` while shifting in zeros. 3603 deprecated("Use _mm_slli_epi16 instead.") __m128i _mm_sll_epi16 (__m128i a, __m128i count) pure @trusted 3604 { 3605 static if (LDC_with_SSE2) 3606 { 3607 return cast(__m128i) _mm_sll_epi16(cast(short8)a, count); 3608 } 3609 else static if (GDC_with_SSE2) 3610 { 3611 return cast(__m128i) _mm_sll_epi16(cast(short8)a, count); 3612 } 3613 else static if (DMD_with_32bit_asm) 3614 { 3615 asm pure nothrow @nogc 3616 { 3617 movdqu XMM0, a; 3618 movdqu XMM1, count; 3619 psllw XMM0, XMM1; 3620 movdqu a, XMM0; 3621 } 3622 return a; 3623 } 3624 else 3625 { 3626 short8 sa = cast(short8)a; 3627 long2 lc = cast(long2)count; 3628 int bits = cast(int)(lc.array[0]); 3629 short8 r = void; 3630 foreach(i; 0..8) 3631 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) << bits); 3632 return cast(int4)r; 3633 } 3634 } 3635 3636 3637 /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. 3638 __m128i _mm_slli_epi32 (__m128i a, int imm8) pure @trusted 3639 { 3640 static if (GDC_with_SSE2) 3641 { 3642 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); 3643 } 3644 else static if (LDC_with_SSE2) 3645 { 3646 return __builtin_ia32_pslldi128(a, cast(ubyte)imm8); 3647 } 3648 else 3649 { 3650 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3651 // D says "It's illegal to shift by the same or more bits 3652 // than the size of the quantity being shifted" 3653 // and it's UB instead. 
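            // Hence the explicit clamp below: a count of 32 or more returns zero,
            // matching what the hardware PSLLD instruction does with a large count.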
3654 int4 r = _mm_setzero_si128(); 3655 3656 ubyte count = cast(ubyte) imm8; 3657 if (count > 31) 3658 return r; 3659 3660 foreach(i; 0..4) 3661 r.array[i] = cast(uint)(a.array[i]) << count; 3662 return r; 3663 } 3664 } 3665 unittest 3666 { 3667 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 3668 __m128i B = _mm_slli_epi32(A, 1); 3669 __m128i B2 = _mm_slli_epi32(A, 1 + 256); 3670 int[4] expectedB = [ 0, 4, 6, -8]; 3671 assert(B.array == expectedB); 3672 assert(B2.array == expectedB); 3673 3674 __m128i C = _mm_slli_epi32(A, 0); 3675 int[4] expectedC = [ 0, 2, 3, -4]; 3676 assert(C.array == expectedC); 3677 3678 __m128i D = _mm_slli_epi32(A, 65); 3679 int[4] expectedD = [ 0, 0, 0, 0]; 3680 assert(D.array == expectedD); 3681 } 3682 3683 /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 3684 __m128i _mm_slli_epi64 (__m128i a, int imm8) pure @trusted 3685 { 3686 static if (GDC_with_SSE2) 3687 { 3688 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3689 } 3690 else static if (LDC_with_SSE2) 3691 { 3692 return cast(__m128i) __builtin_ia32_psllqi128(cast(long2)a, cast(ubyte)imm8); 3693 } 3694 else 3695 { 3696 long2 sa = cast(long2)a; 3697 3698 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3699 // D says "It's illegal to shift by the same or more bits 3700 // than the size of the quantity being shifted" 3701 // and it's UB instead. 3702 long2 r = cast(long2) _mm_setzero_si128(); 3703 ubyte count = cast(ubyte) imm8; 3704 if (count > 63) 3705 return cast(__m128i)r; 3706 3707 r.ptr[0] = cast(ulong)(sa.array[0]) << count; 3708 r.ptr[1] = cast(ulong)(sa.array[1]) << count; 3709 return cast(__m128i)r; 3710 } 3711 } 3712 unittest 3713 { 3714 __m128i A = _mm_setr_epi64(8, -4); 3715 long2 B = cast(long2) _mm_slli_epi64(A, 1); 3716 long2 B2 = cast(long2) _mm_slli_epi64(A, 1 + 1024); 3717 long[2] expectedB = [ 16, -8]; 3718 assert(B.array == expectedB); 3719 assert(B2.array == expectedB); 3720 3721 long2 C = cast(long2) _mm_slli_epi64(A, 0); 3722 long[2] expectedC = [ 8, -4]; 3723 assert(C.array == expectedC); 3724 3725 long2 D = cast(long2) _mm_slli_epi64(A, 64); 3726 long[2] expectedD = [ 0, -0]; 3727 assert(D.array == expectedD); 3728 } 3729 3730 /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 
3731 __m128i _mm_slli_epi16(__m128i a, int imm8) pure @trusted 3732 { 3733 static if (GDC_with_SSE2) 3734 { 3735 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3736 } 3737 else static if (LDC_with_SSE2) 3738 { 3739 return cast(__m128i) __builtin_ia32_psllwi128(cast(short8)a, cast(ubyte)imm8); 3740 } 3741 else static if (LDC_with_ARM64) 3742 { 3743 short8 sa = cast(short8)a; 3744 short8 r = cast(short8)_mm_setzero_si128(); 3745 ubyte count = cast(ubyte) imm8; 3746 if (count > 15) 3747 return cast(__m128i)r; 3748 r = sa << short8(count); 3749 return cast(__m128i)r; 3750 } 3751 else 3752 { 3753 short8 sa = cast(short8)a; 3754 short8 r = cast(short8)_mm_setzero_si128(); 3755 ubyte count = cast(ubyte) imm8; 3756 if (count > 15) 3757 return cast(__m128i)r; 3758 foreach(i; 0..8) 3759 r.ptr[i] = cast(short)(sa.array[i] << count); 3760 return cast(__m128i)r; 3761 } 3762 } 3763 unittest 3764 { 3765 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3766 short8 B = cast(short8)( _mm_slli_epi16(A, 1) ); 3767 short8 B2 = cast(short8)( _mm_slli_epi16(A, 1 + 256) ); 3768 short[8] expectedB = [ 0, 2, 4, 6, -8, -10, 12, 14 ]; 3769 assert(B.array == expectedB); 3770 assert(B2.array == expectedB); 3771 3772 short8 C = cast(short8)( _mm_slli_epi16(A, 16) ); 3773 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0 ]; 3774 assert(C.array == expectedC); 3775 } 3776 3777 3778 /// Shift `a` left by `bytes` bytes while shifting in zeros. 3779 __m128i _mm_slli_si128(ubyte bytes)(__m128i op) pure @trusted 3780 { 3781 static if (bytes & 0xF0) 3782 { 3783 return _mm_setzero_si128(); 3784 } 3785 else static if (GDC_with_SSE2) 3786 { 3787 return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2)op, cast(ubyte)(bytes * 8)); 3788 } 3789 else version(LDC) 3790 { 3791 return cast(__m128i) shufflevectorLDC!(byte16, 3792 16 - bytes, 17 - bytes, 18 - bytes, 19 - bytes, 20 - bytes, 21 - bytes, 3793 22 - bytes, 23 - bytes, 24 - bytes, 25 - bytes, 26 - bytes, 27 - bytes, 3794 28 - bytes, 29 - bytes, 30 - bytes, 31 - bytes) 3795 (cast(byte16)_mm_setzero_si128(), cast(byte16)op); 3796 } 3797 else static if (DMD_with_32bit_asm) 3798 { 3799 asm pure nothrow @nogc @trusted // somehow doesn't work for x86_64 3800 { 3801 movdqu XMM0, op; 3802 pslldq XMM0, bytes; 3803 movdqu op, XMM0; 3804 } 3805 return op; 3806 } 3807 else 3808 { 3809 byte16 A = cast(byte16)op; 3810 byte16 R = void; 3811 for (int n = 15; n >= bytes; --n) 3812 R.ptr[n] = A.array[n-bytes]; 3813 for (int n = bytes-1; n >= 0; --n) 3814 R.ptr[n] = 0; 3815 return cast(__m128i)R; 3816 } 3817 } 3818 unittest 3819 { 3820 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 3821 short8 R = cast(short8) _mm_slli_si128!8(A); // shift 8 bytes to the left 3822 short[8] correct = [ 0, 0, 0, 0, 0, 1, 2, 3 ]; 3823 assert(R.array == correct); 3824 3825 __m128i B = _mm_slli_si128!16(_mm_set1_epi32(-1)); 3826 int[4] expectedB = [0, 0, 0, 0]; 3827 assert(B.array == expectedB); 3828 } 3829 3830 /// Compute the square root of packed double-precision (64-bit) floating-point elements in `vec`. 
3831 __m128d _mm_sqrt_pd(__m128d vec) pure @trusted 3832 { 3833 version(LDC) 3834 { 3835 // Disappeared with LDC 1.11 3836 static if (__VERSION__ < 2081) 3837 return __builtin_ia32_sqrtpd(vec); 3838 else 3839 { 3840 // PERF: use llvm_sqrt on the vector 3841 vec.array[0] = llvm_sqrt(vec.array[0]); 3842 vec.array[1] = llvm_sqrt(vec.array[1]); 3843 return vec; 3844 } 3845 } 3846 else static if (GDC_with_SSE2) 3847 { 3848 return __builtin_ia32_sqrtpd(vec); 3849 } 3850 else 3851 { 3852 vec.ptr[0] = sqrt(vec.array[0]); 3853 vec.ptr[1] = sqrt(vec.array[1]); 3854 return vec; 3855 } 3856 } 3857 3858 /// Compute the square root of the lower double-precision (64-bit) floating-point element in `b`, store the result in 3859 /// the lower element of result, and copy the upper element from `a` to the upper element of result. 3860 __m128d _mm_sqrt_sd(__m128d a, __m128d b) pure @trusted 3861 { 3862 // Note: the builtin has one argument, since the legacy `sqrtsd` SSE2 instruction operates on the same register only. 3863 // "128-bit Legacy SSE version: The first source operand and the destination operand are the same. 3864 // The quadword at bits 127:64 of the destination operand remains unchanged." 3865 version(LDC) 3866 { 3867 // Disappeared with LDC 1.11 3868 static if (__VERSION__ < 2081) 3869 { 3870 __m128d c = __builtin_ia32_sqrtsd(b); 3871 a[0] = c[0]; 3872 return a; 3873 } 3874 else 3875 { 3876 a.array[0] = llvm_sqrt(b.array[0]); 3877 return a; 3878 } 3879 } 3880 else static if (GDC_with_SSE2) 3881 { 3882 __m128d c = __builtin_ia32_sqrtsd(b); 3883 a.ptr[0] = c.array[0]; 3884 return a; 3885 } 3886 else 3887 { 3888 a.ptr[0] = sqrt(b.array[0]); 3889 return a; 3890 } 3891 } 3892 unittest 3893 { 3894 __m128d A = _mm_setr_pd(1.0, 3.0); 3895 __m128d B = _mm_setr_pd(4.0, 5.0); 3896 __m128d R = _mm_sqrt_sd(A, B); 3897 double[2] correct = [2.0, 3.0 ]; 3898 assert(R.array == correct); 3899 } 3900 3901 /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign bits. 3902 deprecated("Use _mm_srai_epi16 instead.") __m128i _mm_sra_epi16 (__m128i a, __m128i count) pure @trusted 3903 { 3904 static if (GDC_with_SSE2) 3905 { 3906 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3907 } 3908 else static if (LDC_with_SSE2) 3909 { 3910 return cast(__m128i) __builtin_ia32_psraw128(cast(short8)a, cast(short8)count); 3911 } 3912 else 3913 { 3914 short8 sa = cast(short8)a; 3915 long2 lc = cast(long2)count; 3916 int bits = cast(int)(lc.array[0]); 3917 short8 r = void; 3918 foreach(i; 0..8) 3919 r.ptr[i] = cast(short)(sa.array[i] >> bits); 3920 return cast(int4)r; 3921 } 3922 } 3923 3924 /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign bits. 3925 deprecated("Use _mm_srai_epi32 instead.") __m128i _mm_sra_epi32 (__m128i a, __m128i count) pure @trusted 3926 { 3927 static if (LDC_with_SSE2) 3928 { 3929 return __builtin_ia32_psrad128(a, count); 3930 } 3931 else static if (GDC_with_SSE2) 3932 { 3933 return __builtin_ia32_psrad128(a, count); 3934 } 3935 else 3936 { 3937 int4 r = void; 3938 long2 lc = cast(long2)count; 3939 int bits = cast(int)(lc.array[0]); 3940 r.ptr[0] = (a.array[0] >> bits); 3941 r.ptr[1] = (a.array[1] >> bits); 3942 r.ptr[2] = (a.array[2] >> bits); 3943 r.ptr[3] = (a.array[3] >> bits); 3944 return r; 3945 } 3946 } 3947 3948 3949 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign bits. 
3950 __m128i _mm_srai_epi16 (__m128i a, int imm8) pure @trusted 3951 { 3952 static if (GDC_with_SSE2) 3953 { 3954 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3955 } 3956 else static if (LDC_with_SSE2) 3957 { 3958 return cast(__m128i) __builtin_ia32_psrawi128(cast(short8)a, cast(ubyte)imm8); 3959 } 3960 else static if (LDC_with_ARM64) 3961 { 3962 short8 sa = cast(short8)a; 3963 ubyte count = cast(ubyte)imm8; 3964 if (count > 15) 3965 count = 15; 3966 short8 r = sa >> short8(count); 3967 return cast(__m128i)r; 3968 } 3969 else 3970 { 3971 short8 sa = cast(short8)a; 3972 short8 r = void; 3973 3974 // Note: the intrinsics guarantee imm8[0..7] is taken, however 3975 // D says "It's illegal to shift by the same or more bits 3976 // than the size of the quantity being shifted" 3977 // and it's UB instead. 3978 ubyte count = cast(ubyte)imm8; 3979 if (count > 15) 3980 count = 15; 3981 foreach(i; 0..8) 3982 r.ptr[i] = cast(short)(sa.array[i] >> count); 3983 return cast(int4)r; 3984 } 3985 } 3986 unittest 3987 { 3988 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 3989 short8 B = cast(short8)( _mm_srai_epi16(A, 1) ); 3990 short8 B2 = cast(short8)( _mm_srai_epi16(A, 1 + 256) ); 3991 short[8] expectedB = [ 0, 0, 1, 1, -2, -3, 3, 3 ]; 3992 assert(B.array == expectedB); 3993 assert(B2.array == expectedB); 3994 3995 short8 C = cast(short8)( _mm_srai_epi16(A, 18) ); 3996 short[8] expectedC = [ 0, 0, 0, 0, -1, -1, 0, 0 ]; 3997 assert(C.array == expectedC); 3998 } 3999 4000 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign bits. 4001 __m128i _mm_srai_epi32 (__m128i a, int imm8) pure @trusted 4002 { 4003 static if (LDC_with_SSE2) 4004 { 4005 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 4006 } 4007 else static if (GDC_with_SSE2) 4008 { 4009 return __builtin_ia32_psradi128(a, cast(ubyte)imm8); 4010 } 4011 else 4012 { 4013 int4 r = void; 4014 4015 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4016 // D says "It's illegal to shift by the same or more bits 4017 // than the size of the quantity being shifted" 4018 // and it's UB instead. 
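        // For an arithmetic right shift, any count of 32 or more behaves like a shift
        // by 31 (every lane collapses to 0 or -1), so the count is clamped below.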
4019 ubyte count = cast(ubyte) imm8; 4020 if (count > 31) 4021 count = 31; 4022 4023 r.ptr[0] = (a.array[0] >> count); 4024 r.ptr[1] = (a.array[1] >> count); 4025 r.ptr[2] = (a.array[2] >> count); 4026 r.ptr[3] = (a.array[3] >> count); 4027 return r; 4028 } 4029 } 4030 unittest 4031 { 4032 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 4033 __m128i B = _mm_srai_epi32(A, 1); 4034 __m128i B2 = _mm_srai_epi32(A, 1 + 256); 4035 int[4] expectedB = [ 0, 1, 1, -2]; 4036 assert(B.array == expectedB); 4037 assert(B2.array == expectedB); 4038 4039 __m128i C = _mm_srai_epi32(A, 32); 4040 int[4] expectedC = [ 0, 0, 0, -1]; 4041 assert(C.array == expectedC); 4042 4043 __m128i D = _mm_srai_epi32(A, 0); 4044 int[4] expectedD = [ 0, 2, 3, -4]; 4045 assert(D.array == expectedD); 4046 } 4047 4048 deprecated("Use _mm_srli_epi16 instead.") __m128i _mm_srl_epi16 (__m128i a, __m128i count) pure @trusted 4049 { 4050 static if (LDC_with_SSE2) 4051 { 4052 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 4053 } 4054 else static if (GDC_with_SSE2) 4055 { 4056 return cast(__m128i) __builtin_ia32_psrlw128(cast(short8)a, cast(short8)count); 4057 } 4058 else 4059 { 4060 short8 sa = cast(short8)a; 4061 long2 lc = cast(long2)count; 4062 int bits = cast(int)(lc.array[0]); 4063 short8 r = void; 4064 foreach(i; 0..8) 4065 r.ptr[i] = cast(short)(cast(ushort)(sa.array[i]) >> bits); 4066 return cast(int4)r; 4067 } 4068 } 4069 4070 deprecated("Use _mm_srli_epi32 instead.") __m128i _mm_srl_epi32 (__m128i a, __m128i count) pure @trusted 4071 { 4072 static if (LDC_with_SSE2) 4073 { 4074 return __builtin_ia32_psrld128(a, count); 4075 } 4076 else static if (GDC_with_SSE2) 4077 { 4078 return __builtin_ia32_psrld128(a, count); 4079 } 4080 else 4081 { 4082 int4 r = void; 4083 long2 lc = cast(long2)count; 4084 int bits = cast(int)(lc.array[0]); 4085 r.ptr[0] = cast(uint)(a.array[0]) >> bits; 4086 r.ptr[1] = cast(uint)(a.array[1]) >> bits; 4087 r.ptr[2] = cast(uint)(a.array[2]) >> bits; 4088 r.ptr[3] = cast(uint)(a.array[3]) >> bits; 4089 return r; 4090 } 4091 } 4092 4093 deprecated("Use _mm_srli_epi64 instead.") __m128i _mm_srl_epi64 (__m128i a, __m128i count) pure @trusted 4094 { 4095 static if (LDC_with_SSE2) 4096 { 4097 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 4098 } 4099 else static if (GDC_with_SSE2) 4100 { 4101 return cast(__m128i) __builtin_ia32_psrlq128(cast(long2)a, cast(long2)count); 4102 } 4103 else 4104 { 4105 // Workaround for https://issues.dlang.org/show_bug.cgi?id=23047 4106 // => avoid void initialization. 4107 long2 r; 4108 long2 sa = cast(long2)a; 4109 long2 lc = cast(long2)count; 4110 int bits = cast(int)(lc.array[0]); 4111 r.ptr[0] = cast(ulong)(sa.array[0]) >> bits; 4112 r.ptr[1] = cast(ulong)(sa.array[1]) >> bits; 4113 return cast(__m128i)r; 4114 } 4115 } 4116 4117 /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in zeros. 
4118 __m128i _mm_srli_epi16 (__m128i a, int imm8) pure @trusted 4119 { 4120 static if (GDC_with_SSE2) 4121 { 4122 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 4123 } 4124 else static if (LDC_with_SSE2) 4125 { 4126 return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8)a, cast(ubyte)imm8); 4127 } 4128 else static if (LDC_with_ARM64) 4129 { 4130 short8 sa = cast(short8)a; 4131 short8 r = cast(short8) _mm_setzero_si128(); 4132 4133 ubyte count = cast(ubyte)imm8; 4134 if (count >= 16) 4135 return cast(__m128i)r; 4136 4137 r = sa >>> short8(count); // This facility offered with LDC, but not DMD. 4138 return cast(__m128i)r; 4139 } 4140 else 4141 { 4142 short8 sa = cast(short8)a; 4143 ubyte count = cast(ubyte)imm8; 4144 4145 short8 r = cast(short8) _mm_setzero_si128(); 4146 if (count >= 16) 4147 return cast(__m128i)r; 4148 4149 foreach(i; 0..8) 4150 r.array[i] = cast(short)(cast(ushort)(sa.array[i]) >> count); 4151 return cast(__m128i)r; 4152 } 4153 } 4154 unittest 4155 { 4156 __m128i A = _mm_setr_epi16(0, 1, 2, 3, -4, -5, 6, 7); 4157 short8 B = cast(short8)( _mm_srli_epi16(A, 1) ); 4158 short8 B2 = cast(short8)( _mm_srli_epi16(A, 1 + 256) ); 4159 short[8] expectedB = [ 0, 0, 1, 1, 0x7FFE, 0x7FFD, 3, 3 ]; 4160 assert(B.array == expectedB); 4161 assert(B2.array == expectedB); 4162 4163 short8 C = cast(short8)( _mm_srli_epi16(A, 16) ); 4164 short[8] expectedC = [ 0, 0, 0, 0, 0, 0, 0, 0]; 4165 assert(C.array == expectedC); 4166 4167 short8 D = cast(short8)( _mm_srli_epi16(A, 0) ); 4168 short[8] expectedD = [ 0, 1, 2, 3, -4, -5, 6, 7 ]; 4169 assert(D.array == expectedD); 4170 } 4171 4172 4173 /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in zeros. 4174 __m128i _mm_srli_epi32 (__m128i a, int imm8) pure @trusted 4175 { 4176 static if (GDC_with_SSE2) 4177 { 4178 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4179 } 4180 else static if (LDC_with_SSE2) 4181 { 4182 return __builtin_ia32_psrldi128(a, cast(ubyte)imm8); 4183 } 4184 else 4185 { 4186 ubyte count = cast(ubyte) imm8; 4187 4188 // Note: the intrinsics guarantee imm8[0..7] is taken, however 4189 // D says "It's illegal to shift by the same or more bits 4190 // than the size of the quantity being shifted" 4191 // and it's UB instead. 4192 int4 r = _mm_setzero_si128(); 4193 if (count >= 32) 4194 return r; 4195 r.ptr[0] = a.array[0] >>> count; 4196 r.ptr[1] = a.array[1] >>> count; 4197 r.ptr[2] = a.array[2] >>> count; 4198 r.ptr[3] = a.array[3] >>> count; 4199 return r; 4200 } 4201 } 4202 unittest 4203 { 4204 __m128i A = _mm_setr_epi32(0, 2, 3, -4); 4205 __m128i B = _mm_srli_epi32(A, 1); 4206 __m128i B2 = _mm_srli_epi32(A, 1 + 256); 4207 int[4] expectedB = [ 0, 1, 1, 0x7FFFFFFE]; 4208 assert(B.array == expectedB); 4209 assert(B2.array == expectedB); 4210 4211 __m128i C = _mm_srli_epi32(A, 255); 4212 int[4] expectedC = [ 0, 0, 0, 0 ]; 4213 assert(C.array == expectedC); 4214 } 4215 4216 /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in zeros. 
4217 __m128i _mm_srli_epi64 (__m128i a, int imm8) pure @trusted 4218 { 4219 static if (GDC_with_SSE2) 4220 { 4221 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4222 } 4223 else static if (LDC_with_SSE2) 4224 { 4225 return cast(__m128i) __builtin_ia32_psrlqi128(cast(long2)a, cast(ubyte)imm8); 4226 } 4227 else 4228 { 4229 long2 r = cast(long2) _mm_setzero_si128(); 4230 long2 sa = cast(long2)a; 4231 4232 ubyte count = cast(ubyte) imm8; 4233 if (count >= 64) 4234 return cast(__m128i)r; 4235 4236 r.ptr[0] = sa.array[0] >>> count; 4237 r.ptr[1] = sa.array[1] >>> count; 4238 return cast(__m128i)r; 4239 } 4240 } 4241 unittest 4242 { 4243 __m128i A = _mm_setr_epi64(8, -4); 4244 long2 B = cast(long2) _mm_srli_epi64(A, 1); 4245 long2 B2 = cast(long2) _mm_srli_epi64(A, 1 + 512); 4246 long[2] expectedB = [ 4, 0x7FFFFFFFFFFFFFFE]; 4247 assert(B.array == expectedB); 4248 assert(B2.array == expectedB); 4249 4250 long2 C = cast(long2) _mm_srli_epi64(A, 64); 4251 long[2] expectedC = [ 0, 0 ]; 4252 assert(C.array == expectedC); 4253 } 4254 4255 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4256 __m128i _mm_srli_si128(ubyte bytes)(__m128i v) pure @trusted 4257 { 4258 static if (bytes & 0xF0) 4259 { 4260 return _mm_setzero_si128(); 4261 } 4262 else static if (GDC_with_SSE2) 4263 { 4264 return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2)v, cast(ubyte)(bytes * 8)); 4265 } 4266 else static if (DMD_with_32bit_asm) 4267 { 4268 asm pure nothrow @nogc @trusted 4269 { 4270 movdqu XMM0, v; 4271 psrldq XMM0, bytes; 4272 movdqu v, XMM0; 4273 } 4274 return v; 4275 } 4276 else version(LDC) 4277 { 4278 return cast(__m128i) shufflevectorLDC!(byte16, 4279 bytes+0, bytes+1, bytes+2, bytes+3, bytes+4, bytes+5, bytes+6, bytes+7, 4280 bytes+8, bytes+9, bytes+10, bytes+11, bytes+12, bytes+13, bytes+14, bytes+15) 4281 (cast(byte16) v, cast(byte16)_mm_setzero_si128()); 4282 } 4283 else 4284 { 4285 byte16 A = cast(byte16)v; 4286 byte16 R = void; 4287 for (int n = 0; n < bytes; ++n) 4288 R.ptr[15-n] = 0; 4289 for (int n = bytes; n < 16; ++n) 4290 R.ptr[15-n] = A.array[15 - n + bytes]; 4291 return cast(__m128i)R; 4292 } 4293 } 4294 unittest 4295 { 4296 __m128i R = _mm_srli_si128!4(_mm_set_epi32(4, 3, -2, 1)); 4297 int[4] correct = [-2, 3, 4, 0]; 4298 assert(R.array == correct); 4299 4300 __m128i A = _mm_srli_si128!16(_mm_set1_epi32(-1)); 4301 int[4] expectedA = [0, 0, 0, 0]; 4302 assert(A.array == expectedA); 4303 } 4304 4305 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4306 /// #BONUS 4307 __m128 _mm_srli_ps(ubyte bytes)(__m128 v) pure @safe 4308 { 4309 return cast(__m128)_mm_srli_si128!bytes(cast(__m128i)v); 4310 } 4311 unittest 4312 { 4313 __m128 R = _mm_srli_ps!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)); 4314 float[4] correct = [3.0f, 4.0f, 0, 0]; 4315 assert(R.array == correct); 4316 } 4317 4318 /// Shift `v` right by `bytes` bytes while shifting in zeros. 4319 /// #BONUS 4320 __m128d _mm_srli_pd(ubyte bytes)(__m128d v) pure @safe 4321 { 4322 return cast(__m128d) _mm_srli_si128!bytes(cast(__m128i)v); 4323 } 4324 4325 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a` into memory. 4326 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
4327 void _mm_store_pd (double* mem_addr, __m128d a) pure @trusted 4328 { 4329 pragma(inline, true); 4330 __m128d* aligned = cast(__m128d*)mem_addr; 4331 *aligned = a; 4332 } 4333 unittest 4334 { 4335 align(16) double[2] A; 4336 __m128d B = _mm_setr_pd(-8.0, 9.0); 4337 _mm_store_pd(A.ptr, B); 4338 assert(A == [-8.0, 9.0]); 4339 } 4340 4341 /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous elements in memory. 4342 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated. 4343 void _mm_store_pd1 (double* mem_addr, __m128d a) pure @trusted 4344 { 4345 __m128d* aligned = cast(__m128d*)mem_addr; 4346 __m128d r; // PERF =void; 4347 r.ptr[0] = a.array[0]; 4348 r.ptr[1] = a.array[0]; 4349 *aligned = r; 4350 } 4351 4352 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. `mem_addr` does not need to 4353 /// be aligned on any particular boundary. 4354 void _mm_store_sd (double* mem_addr, __m128d a) pure @safe 4355 { 4356 pragma(inline, true); 4357 *mem_addr = a.array[0]; 4358 } 4359 4360 /// Store 128-bits of integer data from `a` into memory. `mem_addr` must be aligned on a 16-byte boundary or a 4361 /// general-protection exception may be generated. 4362 void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe 4363 { 4364 pragma(inline, true); 4365 *mem_addr = a; 4366 } 4367 4368 alias _mm_store1_pd = _mm_store_pd1; /// 4369 4370 /// Store the upper double-precision (64-bit) floating-point element from `a` into memory. 4371 void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe 4372 { 4373 pragma(inline, true); 4374 *mem_addr = a.array[1]; 4375 } 4376 4377 // Note: `mem_addr` doesn't have to actually be aligned, which breaks 4378 // expectations from the user point of view. This problem also exist in C++. 4379 void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe 4380 { 4381 pragma(inline, true); 4382 long* dest = cast(long*)mem_addr; 4383 long2 la = cast(long2)a; 4384 *dest = la.array[0]; 4385 } 4386 unittest 4387 { 4388 long[3] A = [1, 2, 3]; 4389 _mm_storel_epi64(cast(__m128i*)(&A[1]), _mm_set_epi64x(0x1_0000_0000, 0x1_0000_0000)); 4390 long[3] correct = [1, 0x1_0000_0000, 3]; 4391 assert(A == correct); 4392 } 4393 4394 /// Store the lower double-precision (64-bit) floating-point element from `a` into memory. 4395 void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe 4396 { 4397 pragma(inline, true); 4398 *mem_addr = a.array[0]; 4399 } 4400 4401 /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse 4402 /// order. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception 4403 /// may be generated. 4404 void _mm_storer_pd (double* mem_addr, __m128d a) pure @system 4405 { 4406 __m128d reversed = void; 4407 reversed.ptr[0] = a.array[1]; 4408 reversed.ptr[1] = a.array[0]; 4409 *cast(__m128d*)mem_addr = reversed; 4410 } 4411 unittest 4412 { 4413 align(16) double[2] A = [0.0, 1.0]; 4414 _mm_storer_pd(A.ptr, _mm_setr_pd(2.0, 3.0)); 4415 assert(A[0] == 3.0 && A[1] == 2.0); 4416 } 4417 4418 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 4419 /// `a` into memory. `mem_addr` does not need to be aligned on any particular boundary. 
4420 void _mm_storeu_pd (double* mem_addr, __m128d a) pure @trusted // TODO: signature, should be system 4421 { 4422 // PERF DMD 4423 pragma(inline, true); 4424 static if (GDC_with_SSE2) 4425 { 4426 __builtin_ia32_storeupd(mem_addr, a); 4427 } 4428 else version(LDC) 4429 { 4430 storeUnaligned!double2(a, mem_addr); 4431 } 4432 else 4433 { 4434 mem_addr[0] = a.array[0]; 4435 mem_addr[1] = a.array[1]; 4436 } 4437 } 4438 unittest 4439 { 4440 __m128d A = _mm_setr_pd(3.0, 4.0); 4441 align(16) double[4] R = [0.0, 0, 0, 0]; 4442 double[2] correct = [3.0, 4.0]; 4443 _mm_storeu_pd(&R[1], A); 4444 assert(R[1..3] == correct); 4445 } 4446 4447 /// Store 128-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular 4448 /// boundary. 4449 void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @trusted // TODO: signature is wrong, mem_addr is not aligned 4450 { 4451 // PERF: DMD 4452 pragma(inline, true); 4453 static if (GDC_with_SSE2) 4454 { 4455 __builtin_ia32_storedqu(cast(char*)mem_addr, cast(ubyte16)a); 4456 } 4457 else version(LDC) 4458 { 4459 storeUnaligned!__m128i(a, cast(int*)mem_addr); 4460 } 4461 else 4462 { 4463 int* p = cast(int*)mem_addr; 4464 p[0] = a.array[0]; 4465 p[1] = a.array[1]; 4466 p[2] = a.array[2]; 4467 p[3] = a.array[3]; 4468 } 4469 } 4470 unittest 4471 { 4472 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 4473 align(16) int[6] R = [0, 0, 0, 0, 0, 0]; 4474 int[4] correct = [1, 2, 3, 4]; 4475 _mm_storeu_si128(cast(__m128i*)(&R[1]), A); 4476 assert(R[1..5] == correct); 4477 } 4478 4479 /// Store 32-bit integer from the first element of `a` into memory. 4480 /// `mem_addr` does not need to be aligned on any particular boundary. 4481 void _mm_storeu_si32 (void* mem_addr, __m128i a) pure @trusted 4482 { 4483 pragma(inline, true); 4484 int* dest = cast(int*)mem_addr; 4485 *dest = a.array[0]; 4486 } 4487 unittest 4488 { 4489 int[2] arr = [-24, 12]; 4490 _mm_storeu_si32(&arr[1], _mm_setr_epi32(-1, -2, -6, -7)); 4491 assert(arr == [-24, -1]); 4492 } 4493 4494 /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) 4495 /// from `a` into memory using a non-temporal memory hint. `mem_addr` must be aligned on a 16-byte 4496 /// boundary or a general-protection exception may be generated. 4497 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 4498 void _mm_stream_pd (double* mem_addr, __m128d a) pure @system 4499 { 4500 // PERF DMD D_SIMD 4501 static if (GDC_with_SSE2) 4502 { 4503 return __builtin_ia32_movntpd(mem_addr, a); 4504 } 4505 else version(LDC) 4506 { 4507 enum prefix = `!0 = !{ i32 1 }`; 4508 enum ir = ` 4509 store <2 x double> %1, <2 x double>* %0, align 16, !nontemporal !0 4510 ret void`; 4511 LDCInlineIREx!(prefix, ir, "", void, double2*, double2)(cast(double2*)mem_addr, a); 4512 } 4513 else 4514 { 4515 // Regular store instead. 4516 __m128d* dest = cast(__m128d*)mem_addr; 4517 *dest = a; 4518 } 4519 } 4520 unittest 4521 { 4522 align(16) double[2] A; 4523 __m128d B = _mm_setr_pd(-8.0, 9.0); 4524 _mm_stream_pd(A.ptr, B); 4525 assert(A == [-8.0, 9.0]); 4526 } 4527 4528 /// Store 128-bits of integer data from a into memory using a non-temporal memory hint. 4529 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception 4530 /// may be generated. 4531 /// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads. 
void _mm_stream_si128 (__m128i* mem_addr, __m128i a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movntdq (cast(long2*)mem_addr, cast(long2)a);
    }
    else version(LDC)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store <4 x i32> %1, <4 x i32>* %0, align 16, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, int4*, int4)(cast(int4*)mem_addr, a);
    }
    else
    {
        // Regular store instead.
        __m128i* dest = cast(__m128i*)mem_addr;
        *dest = a;
    }
}
unittest
{
    align(16) int[4] A;
    __m128i B = _mm_setr_epi32(-8, 9, 10, -11);
    _mm_stream_si128(cast(__m128i*)A.ptr, B);
    assert(A == [-8, 9, 10, -11]);
}

/// Store 32-bit integer a into memory using a non-temporal hint to minimize cache
/// pollution. If the cache line containing address `mem_addr` is already in the cache,
/// the cache will be updated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_si32 (int* mem_addr, int a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movnti(mem_addr, a);
    }
    else version(LDC)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store i32 %1, i32* %0, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, int*, int)(mem_addr, a);
    }
    else
    {
        // Regular store instead.
        *mem_addr = a;
    }
}
unittest
{
    int A;
    _mm_stream_si32(&A, -34);
    assert(A == -34);
}

/// Store 64-bit integer a into memory using a non-temporal hint to minimize
/// cache pollution. If the cache line containing address `mem_addr` is already
/// in the cache, the cache will be updated.
/// Note: non-temporal stores should be followed by `_mm_sfence()` for reader threads.
void _mm_stream_si64 (long* mem_addr, long a) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE2)
    {
        return __builtin_ia32_movnti64(mem_addr, a);
    }
    else version(LDC)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            store i64 %1, i64* %0, !nontemporal !0
            ret void`;
        LDCInlineIREx!(prefix, ir, "", void, long*, long)(mem_addr, a);
    }
    else
    {
        // Regular store instead.
        *mem_addr = a;
    }
}
unittest
{
    long A;
    _mm_stream_si64(&A, -46);
    assert(A == -46);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(short8)a - cast(short8)b);
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(int4)a - cast(int4)b);
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(long2)a - cast(long2)b);
}
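// A few basic checks for _mm_sub_epi16, _mm_sub_epi32 and _mm_sub_epi64 above,
// which had no unittest; values are picked to cover positive, negative and mixed-sign lanes.
unittest
{
    __m128i A = _mm_setr_epi16(4, 8, 13, -7, -1, 0, 9, 77);
    __m128i B = _mm_setr_epi16(1, 2,  3,  4,  5, 6, 7,  8);
    short8 R = cast(short8) _mm_sub_epi16(A, B);
    short[8] correct = [3, 6, 10, -11, -6, -6, 2, 69];
    assert(R.array == correct);
}
unittest
{
    __m128i A = _mm_setr_epi32(4, 8, -7, 77);
    __m128i B = _mm_setr_epi32(1, 2,  3, -4);
    __m128i R = _mm_sub_epi32(A, B);
    int[4] correct = [3, 6, -10, 81];
    assert(R.array == correct);
}
unittest
{
    __m128i A = _mm_setr_epi64(4, -8);
    __m128i B = _mm_setr_epi64(1,  2);
    long2 R = cast(long2) _mm_sub_epi64(A, B);
    long[2] correct = [3, -10];
    assert(R.array == correct);
}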
/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    pragma(inline, true);
    return cast(__m128i)(cast(byte16)a - cast(byte16)b);
}
// TODO unittest

/// Subtract packed double-precision (64-bit) floating-point elements in `b` from packed double-precision (64-bit)
/// floating-point elements in `a`.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    pragma(inline, true);
    return a - b;
}
unittest
{
    __m128d A = _mm_setr_pd(4000.0, -8.0);
    __m128d B = _mm_setr_pd(12.0, -8450.0);
    __m128d C = _mm_sub_pd(A, B);
    double[2] correct = [3988.0, 8442.0];
    assert(C.array == correct);
}

/// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit)
/// floating-point element in `a`, store that in the lower element of result, and copy the upper element from `a` to the
/// upper element of result.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @trusted
{
    version(DigitalMars)
    {
        // Work-around for https://issues.dlang.org/show_bug.cgi?id=19599
        // Note that this is unneeded since DMD >= 2.094.0 at least, haven't investigated again
        asm pure nothrow @nogc @trusted { nop;}
        a[0] = a[0] - b[0];
        return a;
    }
    else static if (GDC_with_SSE2)
    {
        return __builtin_ia32_subsd(a, b);
    }
    else
    {
        a.ptr[0] -= b.array[0];
        return a;
    }
}
unittest
{
    __m128d a = [1.5, -2.0];
    a = _mm_sub_sd(a, a);
    assert(a.array == [0.0, -2.0]);
}

/// Subtract 64-bit integer `b` from 64-bit integer `a`.
__m64 _mm_sub_si64 (__m64 a, __m64 b) pure @safe
{
    pragma(inline, true);
    return a - b;
}
unittest
{
    __m64 A, B;
    A = -1214;
    B = 489415;
    __m64 C = _mm_sub_si64(B, A);
    assert(C.array[0] == 489415 + 1214);
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` using signed saturation.
__m128i _mm_subs_epi16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // Generates PSUBSW since LDC 1.15 -O0
            enum prefix = `declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
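            // Scalar fallback: subtract in 32-bit precision, then saturate each lane back to short.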
            short[8] res; // PERF: =void;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
                res[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsw128(cast(short8) a, cast(short8) b);
    }
    else
    {
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
            res.ptr[i] = saturateSignedIntToSignedShort(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 res = cast(short8) _mm_subs_epi16(_mm_setr_epi16(32760, -32760, 5, 4, 3, 2, 1, 0),
                                             _mm_setr_epi16(-10  ,     16, 5, 4, 3, 2, 1, 0));
    static immutable short[8] correctResult = [32767, -32768, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` using signed saturation.
__m128i _mm_subs_epi8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBSB since LDC 1.15 -O0
            // ARM: Generates sqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.ssub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            byte[16] res; // PERF =void;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubsb128(cast(byte16) a, cast(byte16) b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubsb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        byte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToSignedByte(sa.array[i] - sb.array[i]);
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epi8(_mm_setr_epi8(-128, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(  15, -14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [-128, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit integers in `a` using unsigned saturation.
__m128i _mm_subs_epu16(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSW since LDC 1.15 -O0
            // ARM: Generates uqsub.8h since LDC 1.21 -O0
            enum prefix = `declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)`;
            enum ir = `
                %r = call <8 x i16> @llvm.usub.sat.v8i16( <8 x i16> %0, <8 x i16> %1)
                ret <8 x i16> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", short8, short8, short8)(cast(short8)a, cast(short8)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
            short[8] res; // PERF =void;
            short8 sa = cast(short8)a;
            short8 sb = cast(short8)b;
            foreach(i; 0..8)
            {
                int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
                res[i] = saturateSignedIntToUnsignedShort(sum);
            }
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return cast(__m128i) __builtin_ia32_psubusw128(a, b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        short[8] res; // PERF =void;
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        foreach(i; 0..8)
        {
            int sum = cast(ushort)(sa.array[i]) - cast(ushort)(sb.array[i]);
            res[i] = saturateSignedIntToUnsignedShort(sum);
        }
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    short8 R = cast(short8) _mm_subs_epu16(_mm_setr_epi16(cast(short)65534,  1, 5, 4, 3, 2, 1, 0),
                                           _mm_setr_epi16(cast(short)65535, 16, 4, 4, 3, 0, 1, 0));
    static immutable short[8] correct = [ 0, 0, 1, 0, 0, 2, 0, 0];
    assert(R.array == correct);
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit integers in `a` using unsigned saturation.
__m128i _mm_subs_epu8(__m128i a, __m128i b) pure @trusted
{
    version(LDC)
    {
        static if (__VERSION__ >= 2085) // saturation x86 intrinsics disappeared in LLVM 8
        {
            // x86: Generates PSUBUSB since LDC 1.15 -O0
            // ARM: Generates uqsub.16b since LDC 1.21 -O0
            enum prefix = `declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)`;
            enum ir = `
                %r = call <16 x i8> @llvm.usub.sat.v16i8( <16 x i8> %0, <16 x i8> %1)
                ret <16 x i8> %r`;
            return cast(__m128i) LDCInlineIREx!(prefix, ir, "", byte16, byte16, byte16)(cast(byte16)a, cast(byte16)b);
        }
        else static if (LDC_with_ARM) // Raspberry ships with LDC 1.12, no saturation
        {
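            // Scalar fallback: subtract the zero-extended bytes, then clamp each lane to [0, 255].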
            ubyte[16] res; // PERF =void;
            byte16 sa = cast(byte16)a;
            byte16 sb = cast(byte16)b;
            foreach(i; 0..16)
                res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
            return _mm_loadu_si128(cast(int4*)res.ptr);
        }
        else static if (LDC_with_SSE2)
        {
            return __builtin_ia32_psubusb128(a, b);
        }
        else
            static assert(false);
    }
    else static if (GDC_with_SSE2)
    {
        return cast(__m128i) __builtin_ia32_psubusb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else
    {
        ubyte[16] res; // PERF =void;
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        foreach(i; 0..16)
            res[i] = saturateSignedWordToUnsignedByte(cast(ubyte)(sa.array[i]) - cast(ubyte)(sb.array[i]));
        return _mm_loadu_si128(cast(int4*)res.ptr);
    }
}
unittest
{
    byte16 res = cast(byte16) _mm_subs_epu8(_mm_setr_epi8(cast(byte)254, 127, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
                                            _mm_setr_epi8(cast(byte)255, 120, 14, 42, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    static immutable byte[16] correctResult = [ 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(res.array == correctResult);
}

// Note: the only difference between the `ucomi` and `comi` intrinsics is their signalling
// behaviour on quiet NaNs. Aliasing them is therefore not strictly correct, but a case
// where you would want to distinguish qNaN from sNaN and treat them differently on
// purpose seems extremely rare.
alias _mm_ucomieq_sd = _mm_comieq_sd; ///
alias _mm_ucomige_sd = _mm_comige_sd; ///
alias _mm_ucomigt_sd = _mm_comigt_sd; ///
alias _mm_ucomile_sd = _mm_comile_sd; ///
alias _mm_ucomilt_sd = _mm_comilt_sd; ///
alias _mm_ucomineq_sd = _mm_comineq_sd; ///

/// Return vector of type `__m128d` with undefined elements.
__m128d _mm_undefined_pd() pure @safe
{
    pragma(inline, true);
    __m128d result = void;
    return result;
}

/// Return vector of type `__m128i` with undefined elements.
__m128i _mm_undefined_si128() pure @safe
{
    pragma(inline, true);
    __m128i result = void;
    return result;
}

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
4972 __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @trusted 4973 { 4974 // PERF DMD D_SIMD 4975 static if (GDC_with_SSE2) 4976 { 4977 return cast(__m128i) __builtin_ia32_punpckhwd128(cast(short8) a, cast(short8) b); 4978 } 4979 else version(LDC) 4980 { 4981 return cast(__m128i) shufflevectorLDC!(short8, 4, 12, 5, 13, 6, 14, 7, 15) 4982 (cast(short8)a, cast(short8)b); 4983 } 4984 else static if (DMD_with_32bit_asm) 4985 { 4986 asm pure nothrow @nogc @trusted 4987 { 4988 movdqu XMM0, a; 4989 movdqu XMM1, b; 4990 punpckhwd XMM0, XMM1; 4991 movdqu a, XMM0; 4992 } 4993 return a; 4994 } 4995 else 4996 { 4997 short8 r = void; 4998 short8 sa = cast(short8)a; 4999 short8 sb = cast(short8)b; 5000 r.ptr[0] = sa.array[4]; 5001 r.ptr[1] = sb.array[4]; 5002 r.ptr[2] = sa.array[5]; 5003 r.ptr[3] = sb.array[5]; 5004 r.ptr[4] = sa.array[6]; 5005 r.ptr[5] = sb.array[6]; 5006 r.ptr[6] = sa.array[7]; 5007 r.ptr[7] = sb.array[7]; 5008 return cast(__m128i)r; 5009 } 5010 } 5011 unittest 5012 { 5013 __m128i A = _mm_setr_epi16(4, 5, 6, 7, 8, 9, 10, 11); 5014 __m128i B = _mm_setr_epi16(12, 13, 14, 15, 16, 17, 18, 19); 5015 short8 C = cast(short8)(_mm_unpackhi_epi16(A, B)); 5016 short[8] correct = [8, 16, 9, 17, 10, 18, 11, 19]; 5017 assert(C.array == correct); 5018 } 5019 5020 /// Unpack and interleave 32-bit integers from the high half of `a` and `b`. 5021 __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @trusted 5022 { 5023 static if (GDC_with_SSE2) 5024 { 5025 return __builtin_ia32_punpckhdq128(a, b); 5026 } 5027 else version(LDC) 5028 { 5029 return shufflevectorLDC!(int4, 2, 6, 3, 7)(cast(int4)a, cast(int4)b); 5030 } 5031 else 5032 { 5033 __m128i r = void; 5034 r.ptr[0] = a.array[2]; 5035 r.ptr[1] = b.array[2]; 5036 r.ptr[2] = a.array[3]; 5037 r.ptr[3] = b.array[3]; 5038 return r; 5039 } 5040 } 5041 unittest 5042 { 5043 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 5044 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 5045 __m128i C = _mm_unpackhi_epi32(A, B); 5046 int[4] correct = [3, 7, 4, 8]; 5047 assert(C.array == correct); 5048 } 5049 5050 /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. 5051 __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @trusted 5052 { 5053 static if (GDC_with_SSE2) 5054 { 5055 return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b); 5056 } 5057 else 5058 { 5059 __m128i r = cast(__m128i)b; 5060 r[0] = a[2]; 5061 r[1] = a[3]; 5062 return r; 5063 } 5064 } 5065 unittest // Issue #36 5066 { 5067 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 5068 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 5069 long2 C = cast(long2)(_mm_unpackhi_epi64(A, B)); 5070 long[2] correct = [0x33333333_33333333, 0x55555555_55555555]; 5071 assert(C.array == correct); 5072 } 5073 5074 /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. 
5075 __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @trusted 5076 { 5077 // PERF DMD D_SIMD 5078 static if (GDC_with_SSE2) 5079 { 5080 return cast(__m128i) __builtin_ia32_punpckhbw128(cast(ubyte16)a, cast(ubyte16)b); 5081 } 5082 else static if (DMD_with_32bit_asm) 5083 { 5084 asm pure nothrow @nogc @trusted 5085 { 5086 movdqu XMM0, a; 5087 movdqu XMM1, b; 5088 punpckhbw XMM0, XMM1; 5089 movdqu a, XMM0; 5090 } 5091 return a; 5092 } 5093 else version(LDC) 5094 { 5095 return cast(__m128i)shufflevectorLDC!(byte16, 8, 24, 9, 25, 10, 26, 11, 27, 5096 12, 28, 13, 29, 14, 30, 15, 31) 5097 (cast(byte16)a, cast(byte16)b); 5098 } 5099 else 5100 { 5101 byte16 r = void; 5102 byte16 ba = cast(byte16)a; 5103 byte16 bb = cast(byte16)b; 5104 r.ptr[0] = ba.array[8]; 5105 r.ptr[1] = bb.array[8]; 5106 r.ptr[2] = ba.array[9]; 5107 r.ptr[3] = bb.array[9]; 5108 r.ptr[4] = ba.array[10]; 5109 r.ptr[5] = bb.array[10]; 5110 r.ptr[6] = ba.array[11]; 5111 r.ptr[7] = bb.array[11]; 5112 r.ptr[8] = ba.array[12]; 5113 r.ptr[9] = bb.array[12]; 5114 r.ptr[10] = ba.array[13]; 5115 r.ptr[11] = bb.array[13]; 5116 r.ptr[12] = ba.array[14]; 5117 r.ptr[13] = bb.array[14]; 5118 r.ptr[14] = ba.array[15]; 5119 r.ptr[15] = bb.array[15]; 5120 return cast(__m128i)r; 5121 } 5122 } 5123 unittest 5124 { 5125 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 5126 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 5127 byte16 C = cast(byte16) _mm_unpackhi_epi8(A, B); 5128 byte[16] correct = [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]; 5129 assert(C.array == correct); 5130 } 5131 5132 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of `a` and `b`. 5133 __m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @trusted 5134 { 5135 // PERF DMD D_SIMD 5136 static if (GDC_with_SSE2) 5137 { 5138 return __builtin_ia32_unpckhpd(a, b); 5139 } 5140 else version(LDC) 5141 { 5142 return shufflevectorLDC!(__m128d, 1, 3)(a, b); 5143 } 5144 else 5145 { 5146 double2 r = void; 5147 r.ptr[0] = a.array[1]; 5148 r.ptr[1] = b.array[1]; 5149 return r; 5150 } 5151 } 5152 unittest 5153 { 5154 __m128d A = _mm_setr_pd(4.0, 6.0); 5155 __m128d B = _mm_setr_pd(7.0, 9.0); 5156 __m128d C = _mm_unpackhi_pd(A, B); 5157 double[2] correct = [6.0, 9.0]; 5158 assert(C.array == correct); 5159 } 5160 5161 /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. 
5162 __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @trusted 5163 { 5164 // PERF DMD SIMD 5165 static if (GDC_with_SSE2) 5166 { 5167 return cast(__m128i) __builtin_ia32_punpcklwd128(cast(short8) a, cast(short8) b); 5168 } 5169 else version(LDC) 5170 { 5171 return cast(__m128i) shufflevectorLDC!(short8, 0, 8, 1, 9, 2, 10, 3, 11)(cast(short8)a, cast(short8)b); 5172 } 5173 else static if (DMD_with_32bit_asm) 5174 { 5175 asm pure nothrow @nogc @trusted 5176 { 5177 movdqu XMM0, a; 5178 movdqu XMM1, b; 5179 punpcklwd XMM0, XMM1; 5180 movdqu a, XMM0; 5181 } 5182 return a; 5183 } 5184 else 5185 { 5186 short8 r = void; 5187 short8 sa = cast(short8)a; 5188 short8 sb = cast(short8)b; 5189 r.ptr[0] = sa.array[0]; 5190 r.ptr[1] = sb.array[0]; 5191 r.ptr[2] = sa.array[1]; 5192 r.ptr[3] = sb.array[1]; 5193 r.ptr[4] = sa.array[2]; 5194 r.ptr[5] = sb.array[2]; 5195 r.ptr[6] = sa.array[3]; 5196 r.ptr[7] = sb.array[3]; 5197 return cast(__m128i)r; 5198 } 5199 } 5200 unittest 5201 { 5202 __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); 5203 __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); 5204 short8 C = cast(short8) _mm_unpacklo_epi16(A, B); 5205 short[8] correct = [0, 8, 1, 9, 2, 10, 3, 11]; 5206 assert(C.array == correct); 5207 } 5208 5209 /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. 5210 __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @trusted 5211 { 5212 // PERF DMD 5213 static if (GDC_with_SSE2) 5214 { 5215 return __builtin_ia32_punpckldq128(a, b); 5216 } 5217 else version(LDC) 5218 { 5219 return shufflevectorLDC!(int4, 0, 4, 1, 5)(cast(int4)a, cast(int4)b); 5220 } 5221 else 5222 { 5223 __m128i r; 5224 r.ptr[0] = a.array[0]; 5225 r.ptr[1] = b.array[0]; 5226 r.ptr[2] = a.array[1]; 5227 r.ptr[3] = b.array[1]; 5228 return r; 5229 } 5230 } 5231 unittest 5232 { 5233 __m128i A = _mm_setr_epi32(1, 2, 3, 4); 5234 __m128i B = _mm_setr_epi32(5, 6, 7, 8); 5235 __m128i C = _mm_unpacklo_epi32(A, B); 5236 int[4] correct = [1, 5, 2, 6]; 5237 assert(C.array == correct); 5238 } 5239 5240 /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. 5241 __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @trusted 5242 { 5243 static if (GDC_with_SSE2) 5244 { 5245 return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b); 5246 } 5247 else 5248 { 5249 long2 lA = cast(long2)a; 5250 long2 lB = cast(long2)b; 5251 long2 R; // PERF =void; 5252 R.ptr[0] = lA.array[0]; 5253 R.ptr[1] = lB.array[0]; 5254 return cast(__m128i)R; 5255 } 5256 } 5257 unittest // Issue #36 5258 { 5259 __m128i A = _mm_setr_epi64(0x22222222_22222222, 0x33333333_33333333); 5260 __m128i B = _mm_setr_epi64(0x44444444_44444444, 0x55555555_55555555); 5261 long2 C = cast(long2)(_mm_unpacklo_epi64(A, B)); 5262 long[2] correct = [0x22222222_22222222, 0x44444444_44444444]; 5263 assert(C.array == correct); 5264 } 5265 5266 /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. 
5267 __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @trusted 5268 { 5269 // PERF DMD D_SIMD 5270 static if (GDC_with_SSE2) 5271 { 5272 return cast(__m128i) __builtin_ia32_punpcklbw128(cast(ubyte16) a, cast(ubyte16) b); 5273 } 5274 else static if (DMD_with_32bit_asm) 5275 { 5276 asm pure nothrow @nogc @trusted 5277 { 5278 movdqu XMM0, a; 5279 movdqu XMM1, b; 5280 punpcklbw XMM0, XMM1; 5281 movdqu a, XMM0; 5282 } 5283 return a; 5284 } 5285 else version(LDC) 5286 { 5287 return cast(__m128i) shufflevectorLDC!(byte16, 0, 16, 1, 17, 2, 18, 3, 19, 5288 4, 20, 5, 21, 6, 22, 7, 23) 5289 (cast(byte16)a, cast(byte16)b); 5290 } 5291 else 5292 { 5293 byte16 r = void; 5294 byte16 ba = cast(byte16)a; 5295 byte16 bb = cast(byte16)b; 5296 r.ptr[0] = ba.array[0]; 5297 r.ptr[1] = bb.array[0]; 5298 r.ptr[2] = ba.array[1]; 5299 r.ptr[3] = bb.array[1]; 5300 r.ptr[4] = ba.array[2]; 5301 r.ptr[5] = bb.array[2]; 5302 r.ptr[6] = ba.array[3]; 5303 r.ptr[7] = bb.array[3]; 5304 r.ptr[8] = ba.array[4]; 5305 r.ptr[9] = bb.array[4]; 5306 r.ptr[10] = ba.array[5]; 5307 r.ptr[11] = bb.array[5]; 5308 r.ptr[12] = ba.array[6]; 5309 r.ptr[13] = bb.array[6]; 5310 r.ptr[14] = ba.array[7]; 5311 r.ptr[15] = bb.array[7]; 5312 return cast(__m128i)r; 5313 } 5314 } 5315 unittest 5316 { 5317 __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 5318 __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 5319 byte16 C = cast(byte16) _mm_unpacklo_epi8(A, B); 5320 byte[16] correct = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]; 5321 assert(C.array == correct); 5322 } 5323 5324 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of `a` and `b`. 5325 __m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @trusted 5326 { 5327 // PERF DMD D_SIMD 5328 static if (GDC_with_SSE2) 5329 { 5330 return __builtin_ia32_unpcklpd(a, b); 5331 } 5332 else version(LDC) 5333 { 5334 return shufflevectorLDC!(__m128d, 0, 2)(a, b); 5335 } 5336 else 5337 { 5338 double2 r = void; 5339 r.ptr[0] = a.array[0]; 5340 r.ptr[1] = b.array[0]; 5341 return r; 5342 } 5343 } 5344 unittest 5345 { 5346 __m128d A = _mm_setr_pd(4.0, 6.0); 5347 __m128d B = _mm_setr_pd(7.0, 9.0); 5348 __m128d C = _mm_unpacklo_pd(A, B); 5349 double[2] correct = [4.0, 7.0]; 5350 assert(C.array == correct); 5351 } 5352 5353 /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`. 5354 __m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe 5355 { 5356 return cast(__m128d)(cast(__m128i)a ^ cast(__m128i)b); 5357 } 5358 // TODO unittest 5359 5360 /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and `b`. 5361 __m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe 5362 { 5363 return a ^ b; 5364 } 5365 // TODO unittest 5366 5367 unittest 5368 { 5369 float distance(float[4] a, float[4] b) nothrow @nogc 5370 { 5371 __m128 va = _mm_loadu_ps(a.ptr); 5372 __m128 vb = _mm_loadu_ps(b.ptr); 5373 __m128 diffSquared = _mm_sub_ps(va, vb); 5374 diffSquared = _mm_mul_ps(diffSquared, diffSquared); 5375 __m128 sum = _mm_add_ps(diffSquared, _mm_srli_ps!8(diffSquared)); 5376 sum = _mm_add_ps(sum, _mm_srli_ps!4(sum)); 5377 return _mm_cvtss_f32(_mm_sqrt_ss(sum)); 5378 } 5379 assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2); 5380 }
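// Additional checks for _mm_xor_si128 and _mm_xor_pd above, which had no unittest
// next to their definition; the values are chosen so the bit patterns are easy to follow.
unittest
{
    __m128i A = _mm_setr_epi32(0x0F0F0F0F, 0, -1, 0x12345678);
    __m128i B = _mm_setr_epi32(0x00FF00FF, 0, -1, 0x12345678);
    __m128i R = _mm_xor_si128(A, B);
    int[4] correct = [0x0FF00FF0, 0, 0, 0];
    assert(R.array == correct);
}
unittest
{
    __m128d A = _mm_setr_pd(4.0, -2.0);
    __m128d Z = _mm_setr_pd(0.0, 0.0);
    __m128d R1 = _mm_xor_pd(A, A); // XOR of a value with itself clears every bit.
    __m128d R2 = _mm_xor_pd(A, Z); // XOR with +0.0 leaves the bit pattern unchanged.
    assert(R1.array == [0.0, 0.0]);
    assert(R2.array == [4.0, -2.0]);
}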