/**
* `core.simd` emulation layer.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.types;


pure:
nothrow:
@nogc:

version(GNU)
{
    // Note: for GDC support, be sure to use https://explore.dgnu.org/

    version(X86_64)
    {
        enum MMXSizedVectorsAreEmulated = false;
        enum SSESizedVectorsAreEmulated = false;

        // TODO: use D_AVX and D_AVX2 eventually to detect AVX?
        enum AVXSizedVectorsAreEmulated = true;

        import gcc.builtins;

        float4 loadUnaligned(Vec)(const(float)* pvec) @trusted if (is(Vec == float4))
        {
            return __builtin_ia32_loadups(pvec);
        }

        double2 loadUnaligned(Vec)(const(double)* pvec) @trusted if (is(Vec == double2))
        {
            return __builtin_ia32_loadupd(pvec);
        }

        byte16 loadUnaligned(Vec)(const(byte)* pvec) @trusted if (is(Vec == byte16))
        {
            return cast(byte16) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        short8 loadUnaligned(Vec)(const(short)* pvec) @trusted if (is(Vec == short8))
        {
            return cast(short8) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        int4 loadUnaligned(Vec)(const(int)* pvec) @trusted if (is(Vec == int4))
        {
            return cast(int4) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        long2 loadUnaligned(Vec)(const(long)* pvec) @trusted if (is(Vec == long2))
        {
            return cast(long2) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        void storeUnaligned(Vec)(Vec v, float* pvec) @trusted if (is(Vec == float4))
        {
            __builtin_ia32_storeups(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, double* pvec) @trusted if (is(Vec == double2))
        {
            __builtin_ia32_storeupd(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, byte* pvec) @trusted if (is(Vec == byte16))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, short* pvec) @trusted if (is(Vec == short8))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, int* pvec) @trusted if (is(Vec == int4))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, long* pvec) @trusted if (is(Vec == long2))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        // TODO: for performance, replace this with a GDC intrinsic wherever possible
        Vec shufflevector(Vec, mask...)(Vec a, Vec b) @trusted
        {
            enum Count = Vec.array.length;
            static assert(mask.length == Count);

            Vec r = void;
            foreach(int i, m; mask)
            {
                static assert (m < Count * 2);
                int ind = cast(int)m;
                if (ind < Count)
                    r.ptr[i] = a.array[ind];
                else
                    r.ptr[i] = b.array[ind - Count];
            }
            return r;
        }
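
        // Illustrative sketch of how the shufflevector fallback above is used:
        // mask indices 0..3 select lanes of `a`, 4..7 select lanes of `b`.
        unittest
        {
            float4 a = [1.0f, 2.0f, 3.0f, 4.0f];
            float4 b = [5.0f, 6.0f, 7.0f, 8.0f];
            float4 r = shufflevector!(float4, 0, 4, 1, 5)(a, b);
            float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f];
            assert(r.array == correct);
        }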
    }
    else
    {
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}
else version(LDC)
{
    public import ldc.simd;

    // Use this alias to make it clear that a use is LDC-only,
    // for example when an emulated shufflevector would just be wasteful.
    alias shufflevectorLDC = shufflevector;

    enum MMXSizedVectorsAreEmulated = false;
    enum SSESizedVectorsAreEmulated = false;
    enum AVXSizedVectorsAreEmulated = false;
}
else version(DigitalMars)
{
    public import core.simd;

    // Note: turning this to true is desirable,
    // and leads to many bugs being discovered upstream.
    // The fact that it works relies on many workarounds;
    // in particular, intel-intrinsics with this on is a honeypot for DMD backend bugs.
    // What happens next is that contributors end up on a DMD bug in their PR.
    //
    // Failed attempts: xxx
    //
    static if (__VERSION__ >= 2100)
    {
        enum bool tryToEnableCoreSimdWithDMD = false;
    }
    else
    {
        enum bool tryToEnableCoreSimdWithDMD = false;
    }

    version(D_SIMD)
    {
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;
        version(D_AVX)
            enum AVXSizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;
        else
            enum AVXSizedVectorsAreEmulated = true;
    }
    else
    {
        // Some DMD 32-bit targets don't have D_SIMD
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}

enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;

version(GNU)
    enum bool DefineGenericLoadStoreUnaligned = false;
else
    enum bool DefineGenericLoadStoreUnaligned = CoreSimdIsEmulated;


static if (CoreSimdIsEmulated)
{
    // core.simd is emulated in some capacity: introduce `VectorOps`

    mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
    {
        enum Count = N;
        alias Base = BaseType;

        BaseType* ptr() return pure nothrow @nogc
        {
            return array.ptr;
        }

        // Unary operators
        VectorType opUnary(string op)() pure nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = " ~ op ~ "array[];");
            return res;
        }

        // Binary operators
        VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = array[] " ~ op ~ " other.array[];");
            return res;
        }

        // Assigning a BaseType value
        void opAssign(BaseType e) pure nothrow @safe @nogc
        {
            array[] = e;
        }

        // Assigning a static array
        void opAssign(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
        {
            mixin("array[] " ~ op ~ "= other.array[];");
        }

        // Construct from a static array
        this(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        // Broadcast constructor
        this(BaseType x) pure nothrow @safe @nogc
        {
            array[] = x;
        }

        /// We can't support implicit conversion but do support explicit casting.
        /// "Vector types of the same size can be implicitly converted among each other."
        /// Casting to another vector type is always just a raw copy.
        VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
            if (VecDest.sizeof == VectorType.sizeof)
        {
            VecDest dest = void;
            // Copy
            dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
            return dest;
        }

        ref inout(BaseType) opIndex(size_t i) inout return pure nothrow @safe @nogc
        {
            return array[i];
        }
    }
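
    // Illustrative sketch of the element-wise semantics these operators
    // provide (native __vector types behave the same way): arithmetic is
    // lane-by-lane, and a same-size cast is a raw reinterpretation.
    unittest
    {
        int4 a = [1, 2, 3, 4];
        int4 b = [10, 20, 30, 40];
        int4 sum = a + b;
        static immutable int[4] correctSum = [11, 22, 33, 44];
        assert(sum.array == correctSum);

        float4 one = 1.0f;           // broadcast construction
        int4 bits = cast(int4) one;  // same-size cast, raw copy
        assert(bits.array[0] == 0x3F800000); // bit pattern of 1.0f
    }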

    // These just weren't interesting enough; use v.array[i] instead.
    deprecated auto extractelement(Vec, int index, Vec2)(Vec2 vec) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        return v.array[index];
    }

    // These just weren't interesting enough; use v.ptr[i] = x instead.
    deprecated auto insertelement(Vec, int index, Vec2)(Vec2 vec, Vec.Base e) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        v.array[index] = e;
        return v;
    }
}
else
{
    public import core.simd;

    // GDC cannot implicitly convert a __vector from signed to unsigned, but LDC can,
    // and LDC sometimes needs those unsigned vector types for some intrinsics.
    // For internal use only.
    package alias ushort8 = Vector!(ushort[8]);
    package alias ubyte8 = Vector!(ubyte[8]);
    package alias ubyte16 = Vector!(ubyte[16]);
}

static if (DefineGenericLoadStoreUnaligned)
{
    template loadUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        Vec loadUnaligned(const(BaseType!Vec)* pvec) @trusted
        {
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* enabling this needs moving loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        return cast(Vec)__simd(XMM.LODUPD, *pvec);
                    else static if (is(Vec == float4))
                        return cast(Vec)__simd(XMM.LODUPS, *pvec);
                    else
                        return cast(Vec)__simd(XMM.LODDQU, *pvec);
                }
                else */
                {
                    enum size_t Count = Vec.array.length;
                    Vec result;
                    foreach(int i; 0..Count)
                    {
                        result.ptr[i] = pvec[i];
                    }
                    return result;
                }
            }
            else
            {
                // Since this vector is emulated, it doesn't have alignment constraints
                // and as such we can just cast it.
                return *cast(Vec*)(pvec);
            }
        }
    }
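
    // Minimal usage sketch for the generic fallback above: load 4 floats
    // from an address that is only 4-byte aligned.
    unittest
    {
        float[5] buf = [1.0f, 2.0f, 3.0f, 4.0f, 5.0f];
        float4 v = loadUnaligned!float4(&buf[1]);
        float[4] correct = [2.0f, 3.0f, 4.0f, 5.0f];
        assert(v.array == correct);
    }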

    template storeUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        void storeUnaligned(Vec v, BaseType!Vec* pvec) @trusted
        {
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* enabling this needs moving loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        __simd_sto(XMM.STOUPD, *pvec, value);
                    else static if (is(Vec == float4))
                        __simd_sto(XMM.STOUPS, *pvec, value);
                    else
                        __simd_sto(XMM.STODQU, *pvec, value);
                }
                else*/
                {
                    enum size_t Count = Vec.array.length;
                    foreach(int i; 0..Count)
                        pvec[i] = v.array[i];
                }
            }
            else
            {
                *cast(Vec*)(pvec) = v;
            }
        }
    }

    Vec shufflevector(Vec, mask...)(Vec a, Vec b) @safe if (Vec.sizeof < 32)
    {
        enum size_t Count = Vec.array.length;
        static assert(mask.length == Count);

        Vec r = void;
        foreach(int i, m; mask)
        {
            static assert (m < Count * 2);
            enum int ind = cast(int)m;
            static if (ind < Count)
                r.array[i] = a.array[ind];
            else
                r.array[i] = b.array[ind-Count];
        }
        return r;
    }
}

// Emulate ldc.simd cmpMask and other masks.
// Note: these should be deprecated on non-LDC,
// since it's slower to generate code this way.
version(LDC)
{}
else
{
    private template BaseType(V)
    {
        alias typeof( ( { V v; return v; }()).array[0]) BaseType;
    }

    private template TrueMask(V)
    {
        alias Elem = BaseType!V;

        static if (is(Elem == float))
        {
            immutable uint m1 = 0xffffffff;
            enum Elem TrueMask = *cast(float*)(&m1);
        }
        else static if (is(Elem == double))
        {
            immutable ulong m1 = 0xffffffff_ffffffff;
            enum Elem TrueMask = *cast(double*)(&m1);
        }
        else // integer case
        {
            enum Elem TrueMask = -1;
        }
    }

    Vec equalMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "oeq" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] == b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }

    Vec notEqualMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "one" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] != b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }
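
    // Minimal sketch of the mask semantics: lanes that compare equal become
    // an all-ones bit pattern (-1 for integer lanes), the others become 0.
    unittest
    {
        int4 a = [1, 2, 3, 4];
        int4 b = [1, 9, 3, 9];
        int4 m = equalMask!int4(a, b);
        static immutable int[4] correct = [-1, 0, -1, 0];
        assert(m.array == correct);
    }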

    Vec greaterMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "ogt" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] > b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }
}

unittest
{
    float4 a = [1, 3, 5, 7];
    float4 b = [2, 3, 4, 5];
    int4 c = cast(int4)(greaterMask!float4(a, b));
    static immutable int[4] correct = [0, 0, 0xffff_ffff, 0xffff_ffff];
    assert(c.array == correct);
}

static if (MMXSizedVectorsAreEmulated)
{
    /// MMX-like SIMD types
    struct float2
    {
        float[2] array;
        mixin VectorOps!(float2, float[2]);
    }

    struct byte8
    {
        byte[8] array;
        mixin VectorOps!(byte8, byte[8]);
    }

    struct short4
    {
        short[4] array;
        mixin VectorOps!(short4, short[4]);
    }

    struct int2
    {
        int[2] array;
        mixin VectorOps!(int2, int[2]);
    }

    struct long1
    {
        long[1] array;
        mixin VectorOps!(long1, long[1]);
    }
}
else
{
    // For this compiler, defining MMX-sized vectors works.
    public import core.simd;
    alias Vector!(long [1]) long1;
    alias Vector!(float[2]) float2;
    alias Vector!(int [2]) int2;
    alias Vector!(short[4]) short4;
    alias Vector!(byte [8]) byte8;
}

static assert(float2.sizeof == 8);
static assert(byte8.sizeof == 8);
static assert(short4.sizeof == 8);
static assert(int2.sizeof == 8);
static assert(long1.sizeof == 8);


static if (SSESizedVectorsAreEmulated)
{
    /// SSE-like SIMD types

    struct float4
    {
        float[4] array;
        mixin VectorOps!(float4, float[4]);
    }

    struct byte16
    {
        byte[16] array;
        mixin VectorOps!(byte16, byte[16]);
    }

    struct short8
    {
        short[8] array;
        mixin VectorOps!(short8, short[8]);
    }

    struct int4
    {
        int[4] array;
        mixin VectorOps!(int4, int[4]);
    }

    struct long2
    {
        long[2] array;
        mixin VectorOps!(long2, long[2]);
    }

    struct double2
    {
        double[2] array;
        mixin VectorOps!(double2, double[2]);
    }
}

static assert(float4.sizeof == 16);
static assert(byte16.sizeof == 16);
static assert(short8.sizeof == 16);
static assert(int4.sizeof == 16);
static assert(long2.sizeof == 16);
static assert(double2.sizeof == 16);


static if (AVXSizedVectorsAreEmulated)
{
    /// AVX-like SIMD types

    struct float8
    {
        float[8] array;
        mixin VectorOps!(float8, float[8]);
    }

    struct byte32
    {
        byte[32] array;
        mixin VectorOps!(byte32, byte[32]);
    }

    struct short16
    {
        short[16] array;
        mixin VectorOps!(short16, short[16]);
    }

    struct int8
    {
        int[8] array;
        mixin VectorOps!(int8, int[8]);
    }

    struct long4
    {
        long[4] array;
        mixin VectorOps!(long4, long[4]);
    }

    struct double4
    {
        double[4] array;
        mixin VectorOps!(double4, double[4]);
    }
}
else
{
    public import core.simd;
}
static assert(float8.sizeof == 32);
static assert(byte32.sizeof == 32);
static assert(short16.sizeof == 32);
static assert(int8.sizeof == 32);
static assert(long4.sizeof == 32);
static assert(double4.sizeof == 32);


alias __m256 = float8;
alias __m256i = long4; // long long __vector with ICC, GCC, and clang
alias __m256d = double4;
alias __m128 = float4;
alias __m128i = int4;
alias __m128d = double2;
alias __m64 = long1; // like in Clang, __m64 is a vector of 1 long
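
// Minimal sketch: as noted above, __m64 is a single-lane vector of long,
// so MMX-sized data is reinterpreted through same-size casts (int2, short4, ...).
unittest
{
    __m64 a = 0x00000002_00000001;
    int2 parts = cast(int2) a;
    version(LittleEndian)
    {
        assert(parts.array[0] == 1);
        assert(parts.array[1] == 2);
    }
}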

int _MM_SHUFFLE2(int x, int y) pure @safe
{
    assert(x >= 0 && x <= 1);
    assert(y >= 0 && y <= 1);
    return (x << 1) | y;
}

int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
{
    assert(x >= 0 && x <= 3);
    assert(y >= 0 && y <= 3);
    assert(z >= 0 && z <= 3);
    assert(w >= 0 && w <= 3);
    return (z<<6) | (y<<4) | (x<<2) | w;
}

// test assignment from scalar to vector type
unittest
{
    float4 A = 3.0f;
    float[4] correctA = [3.0f, 3.0f, 3.0f, 3.0f];
    assert(A.array == correctA);

    int2 B = 42;
    int[2] correctB = [42, 42];
    assert(B.array == correctB);
}
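
// Illustrative: _MM_SHUFFLE packs four 2-bit lane indices, highest lane first,
// so (3, 2, 1, 0) is the identity shuffle for a 4-lane vector.
unittest
{
    assert(_MM_SHUFFLE(3, 2, 1, 0) == 0b11_10_01_00); // 0xE4
    assert(_MM_SHUFFLE2(1, 0) == 2);
}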