/**
* `core.simd` emulation layer.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.types;


pure:
nothrow:
@nogc:

version(GNU)
{
    // Note: for GDC support, be sure to use https://explore.dgnu.org/

    version(X86_64)
    {
        enum MMXSizedVectorsAreEmulated = false;
        enum SSESizedVectorsAreEmulated = false;

        // TODO: use D_AVX and D_AVX2 eventually to detect AVX?
        enum AVXSizedVectorsAreEmulated = true;

        import gcc.builtins;

        float4 loadUnaligned(Vec)(const(float)* pvec) @trusted if (is(Vec == float4))
        {
            return __builtin_ia32_loadups(pvec);
        }

        double2 loadUnaligned(Vec)(const(double)* pvec) @trusted if (is(Vec == double2))
        {
            return __builtin_ia32_loadupd(pvec);
        }

        byte16 loadUnaligned(Vec)(const(byte)* pvec) @trusted if (is(Vec == byte16))
        {
            return cast(byte16) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        short8 loadUnaligned(Vec)(const(short)* pvec) @trusted if (is(Vec == short8))
        {
            return cast(short8) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        int4 loadUnaligned(Vec)(const(int)* pvec) @trusted if (is(Vec == int4))
        {
            return cast(int4) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        long2 loadUnaligned(Vec)(const(long)* pvec) @trusted if (is(Vec == long2))
        {
            return cast(long2) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        void storeUnaligned(Vec)(Vec v, float* pvec) @trusted if (is(Vec == float4))
        {
            __builtin_ia32_storeups(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, double* pvec) @trusted if (is(Vec == double2))
        {
            __builtin_ia32_storeupd(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, byte* pvec) @trusted if (is(Vec == byte16))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, short* pvec) @trusted if (is(Vec == short8))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, int* pvec) @trusted if (is(Vec == int4))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, long* pvec) @trusted if (is(Vec == long2))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        // TODO: for performance, replace this with a GDC intrinsic wherever possible
        Vec shufflevector(Vec, mask...)(Vec a, Vec b) @trusted
        {
            enum Count = Vec.array.length;
            static assert(mask.length == Count);

            Vec r = void;
            foreach(int i, m; mask)
            {
                static assert(m < Count * 2);
                int ind = cast(int)m;
                if (ind < Count)
                    r.ptr[i] = a.array[ind];
                else
                    r.ptr[i] = b.array[ind - Count];
            }
            return r;
        }
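
        // An added sanity check for the GDC load/store wrappers above
        // (compiled on GDC/x86_64 only); a minimal sketch with illustrative values.
        unittest
        {
            float[4] src = [1.0f, 2.0f, 3.0f, 4.0f];
            float4 v = loadUnaligned!float4(src.ptr);
            float[4] dst;
            storeUnaligned!float4(v, dst.ptr);
            assert(dst == src);
        }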
    }
    else
    {
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}
else version(LDC)
{
    public import ldc.simd;

    // Use this alias to signal that something should only be used with LDC,
    // for example when the emulated shufflevector would just be wasteful.
    alias shufflevectorLDC = shufflevector;

    enum MMXSizedVectorsAreEmulated = false;
    enum SSESizedVectorsAreEmulated = false;
    enum AVXSizedVectorsAreEmulated = false;
}
else version(DigitalMars)
{
    public import core.simd;

    // Note: turning this true is desirable, and leads to many bugs being
    // discovered upstream. The fact that it works relies on many workarounds;
    // in particular, intel-intrinsics with this enabled is a honeypot for
    // DMD backend bugs.
    static if (__VERSION__ >= 2100)
    {
        enum bool tryToEnableCoreSimdWithDMD = true;
    }
    else
    {
        enum bool tryToEnableCoreSimdWithDMD = false;
    }

    version(D_SIMD)
    {
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;
        version(D_AVX)
            enum AVXSizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;
        else
            enum AVXSizedVectorsAreEmulated = true;
    }
    else
    {
        // Some DMD 32-bit targets don't have D_SIMD
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}

enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;

version(GNU)
    enum bool DefineGenericLoadStoreUnaligned = false;
else
    enum bool DefineGenericLoadStoreUnaligned = CoreSimdIsEmulated;


static if (CoreSimdIsEmulated)
{
    // core.simd is emulated in some capacity: introduce `VectorOps`

    mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
    {
        enum Count = N;
        alias Base = BaseType;

        BaseType* ptr() return pure nothrow @nogc
        {
            return array.ptr;
        }

        // Unary operators
        VectorType opUnary(string op)() pure nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = " ~ op ~ "array[];");
            return res;
        }

        // Binary operators
        VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = array[] " ~ op ~ " other.array[];");
            return res;
        }

        // Assigning a BaseType value
        void opAssign(BaseType e) pure nothrow @safe @nogc
        {
            array[] = e;
        }

        // Assigning a static array
        void opAssign(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
        {
            mixin("array[] " ~ op ~ "= other.array[];");
        }

        // Constructing from a static array
        this(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        // Broadcast constructor
        this(BaseType x) pure nothrow @safe @nogc
        {
            array[] = x;
        }

        /// We can't support implicit conversion but do support explicit casting.
        /// "Vector types of the same size can be implicitly converted among each other."
        /// Casting to another vector type is always just a raw copy.
        VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
            if (VecDest.sizeof == VectorType.sizeof)
        {
            VecDest dest = void;
            // Copy
            dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
            return dest;
        }

        ref inout(BaseType) opIndex(size_t i) inout return pure nothrow @safe @nogc
        {
            return array[i];
        }

    }
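
    // An added sketch of the operator surface provided by `VectorOps`:
    // broadcast construction, element-wise arithmetic, compound assignment,
    // and same-size raw casts. These operations also exist on real `__vector`
    // types, so the test is valid whether or not a given size is emulated.
    unittest
    {
        int4 a = 3;                // broadcast
        int4 b = 4;
        int4 c = a + b;            // element-wise opBinary
        assert(c.array == [7, 7, 7, 7]);
        c += a;                    // opOpAssign
        assert(c.array == [10, 10, 10, 10]);
        float4 f = cast(float4) c; // raw bitwise copy of same-sized vector
        assert((cast(int4) f).array == c.array);
    }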

    // These just weren't interesting enough; use v.array[i] instead.
    deprecated auto extractelement(Vec, int index, Vec2)(Vec2 vec) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        return v.array[index];
    }

    // These just weren't interesting enough; use v.ptr[i] = x instead.
    deprecated auto insertelement(Vec, int index, Vec2)(Vec2 vec, Vec.Base e) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        v.array[index] = e;
        return v;
    }
}
else
{
    public import core.simd;

    // GDC cannot implicitly convert __vector from signed to unsigned, but LDC can,
    // and LDC sometimes needs those unsigned vector types for some intrinsics.
    // For internal use only.
    package alias ushort8 = Vector!(ushort[8]);
    package alias ubyte8  = Vector!(ubyte[8]);
    package alias ubyte16 = Vector!(ubyte[16]);
}
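
// An added illustration: casting between same-sized vector types is a raw
// bitwise copy, in both the emulated and the native case. 0x3F800000 is the
// bit pattern of 1.0f.
unittest
{
    int4 i = [0x3F800000, 0, 0, 0];
    float4 f = cast(float4) i;
    assert(f.array[0] == 1.0f);
}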

static if (DefineGenericLoadStoreUnaligned)
{
    template loadUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        Vec loadUnaligned(const(BaseType!Vec)* pvec) @trusted
        {
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* enabling this needs moving loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        return cast(Vec)__simd(XMM.LODUPD, *pvec);
                    else static if (is(Vec == float4))
                        return cast(Vec)__simd(XMM.LODUPS, *pvec);
                    else
                        return cast(Vec)__simd(XMM.LODDQU, *pvec);
                }
                else */
                {
                    enum size_t Count = Vec.array.length;
                    Vec result;
                    foreach(int i; 0..Count)
                    {
                        result.ptr[i] = pvec[i];
                    }
                    return result;
                }
            }
            else
            {
                // Since this vector is emulated, it doesn't have alignment constraints
                // and as such we can just cast it.
                return *cast(Vec*)(pvec);
            }
        }
    }

    template storeUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        void storeUnaligned(Vec v, BaseType!Vec* pvec) @trusted
        {
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* enabling this needs moving loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        __simd_sto(XMM.STOUPD, *pvec, value);
                    else static if (is(Vec == float4))
                        __simd_sto(XMM.STOUPS, *pvec, value);
                    else
                        __simd_sto(XMM.STODQU, *pvec, value);
                }
                else*/
                {
                    enum size_t Count = Vec.array.length;
                    foreach(int i; 0..Count)
                        pvec[i] = v.array[i];
                }
            }
            else
            {
                *cast(Vec*)(pvec) = v;
            }
        }
    }

    Vec shufflevector(Vec, mask...)(Vec a, Vec b) @safe if (Vec.sizeof < 32)
    {
        enum size_t Count = Vec.array.length;
        static assert(mask.length == Count);

        Vec r = void;
        foreach(int i, m; mask)
        {
            static assert(m < Count * 2);
            enum int ind = cast(int)m;
            static if (ind < Count)
                r.array[i] = a.array[ind];
            else
                r.array[i] = b.array[ind - Count];
        }
        return r;
    }
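
    // An added usage sketch for the generic fallbacks above; only compiled
    // when DefineGenericLoadStoreUnaligned is true. Values are illustrative.
    unittest
    {
        int[5] buf = [1, 2, 3, 4, 5];
        int4 v = loadUnaligned!int4(buf.ptr + 1); // deliberately misaligned source
        assert(v.array == [2, 3, 4, 5]);

        int[4] dst;
        storeUnaligned!int4(v, dst.ptr);
        assert(dst == [2, 3, 4, 5]);

        int4 zero = 0;
        int4 s = shufflevector!(int4, 0, 4, 1, 5)(v, zero); // interleave low halves
        assert(s.array == [2, 0, 3, 0]);
    }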
}

// Emulate ldc.simd cmpMask and other masks.
// Note: these should be deprecated on non-LDC,
// since the generated code is slower there.
version(LDC)
{}
else
{
    private template BaseType(V)
    {
        alias typeof( ( { V v; return v; }()).array[0]) BaseType;
    }

    private template TrueMask(V)
    {
        alias Elem = BaseType!V;

        static if (is(Elem == float))
        {
            immutable uint m1 = 0xffffffff;
            enum Elem TrueMask = *cast(float*)(&m1);
        }
        else static if (is(Elem == double))
        {
            immutable ulong m1 = 0xffffffff_ffffffff;
            enum Elem TrueMask = *cast(double*)(&m1);
        }
        else // integer case
        {
            enum Elem TrueMask = -1;
        }
    }

    Vec equalMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "oeq" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] == b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }

    Vec notEqualMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "one" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] != b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }

    Vec greaterMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "ogt" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] > b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }
}

unittest
{
    float4 a = [1, 3, 5, 7];
    float4 b = [2, 3, 4, 5];
    int4 c = cast(int4)(greaterMask!float4(a, b));
    static immutable int[4] correct = [0, 0, 0xffff_ffff, 0xffff_ffff];
    assert(c.array == correct);
}

static if (MMXSizedVectorsAreEmulated)
{
    /// MMX-like SIMD types
    struct float2
    {
        float[2] array;
        mixin VectorOps!(float2, float[2]);
    }

    struct byte8
    {
        byte[8] array;
        mixin VectorOps!(byte8, byte[8]);
    }

    struct short4
    {
        short[4] array;
        mixin VectorOps!(short4, short[4]);
    }

    struct int2
    {
        int[2] array;
        mixin VectorOps!(int2, int[2]);
    }

    struct long1
    {
        long[1] array;
        mixin VectorOps!(long1, long[1]);
    }
}
else
{
    // On this compiler, MMX-sized vectors work natively.
    public import core.simd;
    alias Vector!(long [1]) long1;
    alias Vector!(float[2]) float2;
    alias Vector!(int  [2]) int2;
    alias Vector!(short[4]) short4;
    alias Vector!(byte [8]) byte8;
}

static assert(float2.sizeof == 8);
static assert(byte8.sizeof  == 8);
static assert(short4.sizeof == 8);
static assert(int2.sizeof   == 8);
static assert(long1.sizeof  == 8);


static if (SSESizedVectorsAreEmulated)
{
    /// SSE-like SIMD types

    struct float4
    {
        float[4] array;
        mixin VectorOps!(float4, float[4]);
    }

    struct byte16
    {
        byte[16] array;
        mixin VectorOps!(byte16, byte[16]);
    }

    struct short8
    {
        short[8] array;
        mixin VectorOps!(short8, short[8]);
    }

    struct int4
    {
        int[4] array;
        mixin VectorOps!(int4, int[4]);
    }

    struct long2
    {
        long[2] array;
        mixin VectorOps!(long2, long[2]);
    }

    struct double2
    {
        double[2] array;
        mixin VectorOps!(double2, double[2]);
    }
}

static assert(float4.sizeof  == 16);
static assert(byte16.sizeof  == 16);
static assert(short8.sizeof  == 16);
static assert(int4.sizeof    == 16);
static assert(long2.sizeof   == 16);
static assert(double2.sizeof == 16);


static if (AVXSizedVectorsAreEmulated)
{
    /// AVX-like SIMD types

    struct float8
    {
        float[8] array;
        mixin VectorOps!(float8, float[8]);
    }

    struct byte32
    {
        byte[32] array;
        mixin VectorOps!(byte32, byte[32]);
    }

    struct short16
    {
        short[16] array;
        mixin VectorOps!(short16, short[16]);
    }

    struct int8
    {
        int[8] array;
        mixin VectorOps!(int8, int[8]);
    }

    struct long4
    {
        long[4] array;
        mixin VectorOps!(long4, long[4]);
    }

    struct double4
    {
        double[4] array;
        mixin VectorOps!(double4, double[4]);
    }
}
else
{
    public import core.simd;
}
static assert(float8.sizeof  == 32);
static assert(byte32.sizeof  == 32);
static assert(short16.sizeof == 32);
static assert(int8.sizeof    == 32);
static assert(long4.sizeof   == 32);
static assert(double4.sizeof == 32);


alias __m256  = float8;
alias __m256i = long4; // long long __vector with ICC, GCC, and clang
alias __m256d = double4;
alias __m128  = float4;
alias __m128i = int4;
alias __m128d = double2;
alias __m64   = long1; // as in Clang, __m64 is a vector of one long

int _MM_SHUFFLE2(int x, int y) pure @safe
{
    assert(x >= 0 && x <= 1);
    assert(y >= 0 && y <= 1);
    return (x << 1) | y;
}

int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
{
    assert(x >= 0 && x <= 3);
    assert(y >= 0 && y <= 3);
    assert(z >= 0 && z <= 3);
    assert(w >= 0 && w <= 3);
    return (z<<6) | (y<<4) | (x<<2) | w;
}

// test assignment from scalar to vector type
unittest
{
    float4 A = 3.0f;
    float[4] correctA = [3.0f, 3.0f, 3.0f, 3.0f];
    assert(A.array == correctA);

    int2 B = 42;
    int[2] correctB = [42, 42];
    assert(B.array == correctB);
}
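
// Added sanity checks for the shuffle immediates above; for instance, the
// 4-lane identity shuffle is 0b11_10_01_00 == 0xE4.
unittest
{
    assert(_MM_SHUFFLE(3, 2, 1, 0) == 0xE4);
    assert(_MM_SHUFFLE2(1, 0) == 2);
    assert(_MM_SHUFFLE2(0, 1) == 1);
}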