/**
 * `core.simd` emulation layer.
 *
 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module inteli.types;


pure:
nothrow:
@nogc:

version(GNU)
{
    // Note: for GDC support, be sure to use https://explore.dgnu.org/

    version(X86_64)
    {
        enum MMXSizedVectorsAreEmulated = false;
        enum SSESizedVectorsAreEmulated = false;

        // TODO: use D_AVX and D_AVX2 eventually to detect AVX?
        enum AVXSizedVectorsAreEmulated = true;

        import gcc.builtins;

        float4 loadUnaligned(Vec)(const(float)* pvec) @trusted if (is(Vec == float4))
        {
            return __builtin_ia32_loadups(pvec);
        }

        double2 loadUnaligned(Vec)(const(double)* pvec) @trusted if (is(Vec == double2))
        {
            return __builtin_ia32_loadupd(pvec);
        }

        byte16 loadUnaligned(Vec)(const(byte)* pvec) @trusted if (is(Vec == byte16))
        {
            return cast(byte16) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        short8 loadUnaligned(Vec)(const(short)* pvec) @trusted if (is(Vec == short8))
        {
            return cast(short8) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        int4 loadUnaligned(Vec)(const(int)* pvec) @trusted if (is(Vec == int4))
        {
            return cast(int4) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        long2 loadUnaligned(Vec)(const(long)* pvec) @trusted if (is(Vec == long2))
        {
            return cast(long2) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        void storeUnaligned(Vec)(Vec v, float* pvec) @trusted if (is(Vec == float4))
        {
            __builtin_ia32_storeups(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, double* pvec) @trusted if (is(Vec == double2))
        {
            __builtin_ia32_storeupd(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, byte* pvec) @trusted if (is(Vec == byte16))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, short* pvec) @trusted if (is(Vec == short8))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, int* pvec) @trusted if (is(Vec == int4))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, long* pvec) @trusted if (is(Vec == long2))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        // TODO: for performance, replace this wherever possible with a GDC intrinsic
        Vec shufflevector(Vec, mask...)(Vec a, Vec b) @trusted
        {
            enum Count = Vec.array.length;
            static assert(mask.length == Count);

            Vec r = void;
            foreach(int i, m; mask)
            {
                static assert (m < Count * 2);
                int ind = cast(int)m;
                if (ind < Count)
                    r.ptr[i] = a.array[ind];
                else
                    r.ptr[i] = b.array[ind - Count];
            }
            return r;
        }
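
        // A minimal usage sketch for the shufflevector fallback above
        // (GDC-only code path); interleaves the low halves of two int4 vectors.
        unittest
        {
            int4 a = [0, 1, 2, 3];
            int4 b = [4, 5, 6, 7];
            int4 r = shufflevector!(int4, 0, 4, 1, 5)(a, b);
            static immutable int[4] correct = [0, 4, 1, 5];
            assert(r.array == correct);
        }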
    }
    else
    {
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}
else version(LDC)
{
    public import ldc.simd;

    // Use this alias to signal that a construct should only be used with LDC,
    // for example when an emulated shufflevector would just be wasteful.
    alias shufflevectorLDC = shufflevector;

    enum MMXSizedVectorsAreEmulated = false;
    enum SSESizedVectorsAreEmulated = false;
    enum AVXSizedVectorsAreEmulated = false;
}
else version(DigitalMars)
{
    public import core.simd;

    // Note: turning this true is desirable,
    // and leads to many bugs being discovered upstream.
    // Last attempt to enable this: DMD 2.100-b1.
    // When this turns true, make it depend on __VERSION__.
    // 30.04.2022 = all tests pass, with DMD core.simd actually used. Promising.
    enum bool tryToEnableCoreSimdWithDMD = false;

    version(D_SIMD)
    {
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;
        version(D_AVX)
            enum AVXSizedVectorsAreEmulated = !tryToEnableCoreSimdWithDMD;
        else
            enum AVXSizedVectorsAreEmulated = true;
    }
    else
    {
        // Some DMD 32-bit targets don't have D_SIMD
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}

enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;

version(GNU)
    enum bool DefineGenericLoadStoreUnaligned = false;
else
    enum bool DefineGenericLoadStoreUnaligned = CoreSimdIsEmulated;


static if (CoreSimdIsEmulated)
{
    // core.simd is emulated in some capacity: introduce `VectorOps`

    mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
    {
        enum Count = N;
        alias Base = BaseType;

        BaseType* ptr() return pure nothrow @nogc
        {
            return array.ptr;
        }

        // Unary operators
        VectorType opUnary(string op)() pure nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = " ~ op ~ "array[];");
            return res;
        }

        // Binary operators
        VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = array[] " ~ op ~ " other.array[];");
            return res;
        }

        // Assigning a BaseType value
        void opAssign(BaseType e) pure nothrow @safe @nogc
        {
            array[] = e;
        }

        // Assigning a static array
        void opAssign(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
        {
            mixin("array[] " ~ op ~ "= other.array[];");
        }

        // Constructing from a static array
        this(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        // Broadcast constructor
        this(BaseType x) pure nothrow @safe @nogc
        {
            array[] = x;
        }

        /// We can't support implicit conversion but do support explicit casting.
        /// "Vector types of the same size can be implicitly converted among each other."
        /// Casting to another vector type is always just a raw copy.
        VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
            if (VecDest.sizeof == VectorType.sizeof)
        {
            VecDest dest = void;
            // Copy
            dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
            return dest;
        }

        ref inout(BaseType) opIndex(size_t i) inout return pure nothrow @safe @nogc
        {
            return array[i];
        }

    }
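
    // A minimal sketch of what `VectorOps` provides on an emulated vector:
    // broadcast construction, element-wise arithmetic, and raw-copy casts.
    unittest
    {
        float4 a = 1.0f;          // broadcast constructor
        float4 b = a + a;         // element-wise opBinary
        assert(b.array[3] == 2.0f);

        int4 bits = cast(int4) a; // raw copy between same-sized vector types
        assert(bits.array[0] == 0x3f800000); // IEEE-754 bit pattern of 1.0f
    }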
    // Deprecated: this helper just wasn't interesting enough; use v.array[i] instead.
    deprecated auto extractelement(Vec, int index, Vec2)(Vec2 vec) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        return v.array[index];
    }

    // Deprecated: this helper just wasn't interesting enough; use v.ptr[i] = x instead.
    deprecated auto insertelement(Vec, int index, Vec2)(Vec2 vec, Vec.Base e) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        v.array[index] = e;
        return v;
    }
}
else
{
    public import core.simd;

    // GDC cannot implicitly convert __vector from signed to unsigned, but LDC can,
    // and LDC sometimes needs those unsigned vector types for some intrinsics.
    // For internal use only.
    package alias ushort8 = Vector!(ushort[8]);
    package alias ubyte8 = Vector!(ubyte[8]);
    package alias ubyte16 = Vector!(ubyte[16]);
}

static if (DefineGenericLoadStoreUnaligned)
{
    template loadUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        Vec loadUnaligned(const(BaseType!Vec)* pvec) @trusted
        {
            enum bool isVector = ( (Vec.sizeof == 8) && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* enabling this needs moving loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        return cast(Vec)__simd(XMM.LODUPD, *pvec);
                    else static if (is(Vec == float4))
                        return cast(Vec)__simd(XMM.LODUPS, *pvec);
                    else
                        return cast(Vec)__simd(XMM.LODDQU, *pvec);
                }
                else */
                {
                    enum size_t Count = Vec.array.length;
                    Vec result;
                    foreach(int i; 0..Count)
                    {
                        result.ptr[i] = pvec[i];
                    }
                    return result;
                }
            }
            else
            {
                // Since this vector is emulated, it doesn't have alignment constraints
                // and as such we can just cast it.
                return *cast(Vec*)(pvec);
            }
        }
    }

    template storeUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        void storeUnaligned(Vec v, BaseType!Vec* pvec) @trusted
        {
            enum bool isVector = ( (Vec.sizeof == 8) && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* enabling this needs moving loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        __simd_sto(XMM.STOUPD, *pvec, value);
                    else static if (is(Vec == float4))
                        __simd_sto(XMM.STOUPS, *pvec, value);
                    else
                        __simd_sto(XMM.STODQU, *pvec, value);
                }
                else*/
                {
                    enum size_t Count = Vec.array.length;
                    foreach(int i; 0..Count)
                        pvec[i] = v.array[i];
                }
            }
            else
            {
                *cast(Vec*)(pvec) = v;
            }
        }
    }

    Vec shufflevector(Vec, mask...)(Vec a, Vec b) @safe if (Vec.sizeof < 32)
    {
        enum size_t Count = Vec.array.length;
        static assert(mask.length == Count);

        Vec r = void;
        foreach(int i, m; mask)
        {
            static assert (m < Count * 2);
            enum int ind = cast(int)m;
            static if (ind < Count)
                r.array[i] = a.array[ind];
            else
                r.array[i] = b.array[ind-Count];
        }
        return r;
    }
}
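
// A round-trip sketch for the generic unaligned load/store fallback above,
// using a deliberately misaligned slot inside a byte buffer. (LDC and GDC
// builds resolve these calls to their own versions, so this test is guarded.)
static if (DefineGenericLoadStoreUnaligned)
{
    unittest
    {
        byte[16 + 4] buf;                 // no particular alignment at offset +1
        int* p = cast(int*)(buf.ptr + 1); // deliberately misaligned pointer
        int4 v = [1, 2, 3, 4];
        storeUnaligned!int4(v, p);
        int4 w = loadUnaligned!int4(p);
        static immutable int[4] correct = [1, 2, 3, 4];
        assert(w.array == correct);
    }
}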

// Emulate ldc.simd cmpMask and other masks.
// Note: these should be deprecated on non-LDC,
// since generating that code is slower there.
version(LDC)
{}
else
{
    private template BaseType(V)
    {
        alias typeof( ( { V v; return v; }()).array[0]) BaseType;
    }

    private template TrueMask(V)
    {
        alias Elem = BaseType!V;

        static if (is(Elem == float))
        {
            immutable uint m1 = 0xffffffff;
            enum Elem TrueMask = *cast(float*)(&m1);
        }
        else static if (is(Elem == double))
        {
            immutable ulong m1 = 0xffffffff_ffffffff;
            enum Elem TrueMask = *cast(double*)(&m1);
        }
        else // integer case
        {
            enum Elem TrueMask = -1;
        }
    }

    Vec equalMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "oeq" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] == b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }

    Vec notEqualMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "one" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] != b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }

    Vec greaterMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "ogt" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] > b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }
}
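
// Sketch: a comparison mask sets every bit of a lane where the comparison
// holds (all-ones, i.e. -1 for integer lanes) and clears the lane otherwise.
unittest
{
    int4 a = [1, 2, 3, 4];
    int4 b = [1, 0, 3, 0];
    int4 m = equalMask!int4(a, b);
    static immutable int[4] correct = [-1, 0, -1, 0];
    assert(m.array == correct);
}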

unittest
{
    float4 a = [1, 3, 5, 7];
    float4 b = [2, 3, 4, 5];
    int4 c = cast(int4)(greaterMask!float4(a, b));
    static immutable int[4] correct = [0, 0, 0xffff_ffff, 0xffff_ffff];
    assert(c.array == correct);
}

static if (MMXSizedVectorsAreEmulated)
{
    /// MMX-like SIMD types
    struct float2
    {
        float[2] array;
        mixin VectorOps!(float2, float[2]);
    }

    struct byte8
    {
        byte[8] array;
        mixin VectorOps!(byte8, byte[8]);
    }

    struct short4
    {
        short[4] array;
        mixin VectorOps!(short4, short[4]);
    }

    struct int2
    {
        int[2] array;
        mixin VectorOps!(int2, int[2]);
    }

    struct long1
    {
        long[1] array;
        mixin VectorOps!(long1, long[1]);
    }
}
else
{
    // For this compiler, defining MMX-sized vectors works.
    public import core.simd;
    alias Vector!(long [1]) long1;
    alias Vector!(float[2]) float2;
    alias Vector!(int  [2]) int2;
    alias Vector!(short[4]) short4;
    alias Vector!(byte [8]) byte8;
}

static assert(float2.sizeof == 8);
static assert(byte8.sizeof == 8);
static assert(short4.sizeof == 8);
static assert(int2.sizeof == 8);
static assert(long1.sizeof == 8);


static if (SSESizedVectorsAreEmulated)
{
    /// SSE-like SIMD types

    struct float4
    {
        float[4] array;
        mixin VectorOps!(float4, float[4]);
    }

    struct byte16
    {
        byte[16] array;
        mixin VectorOps!(byte16, byte[16]);
    }

    struct short8
    {
        short[8] array;
        mixin VectorOps!(short8, short[8]);
    }

    struct int4
    {
        int[4] array;
        mixin VectorOps!(int4, int[4]);
    }

    struct long2
    {
        long[2] array;
        mixin VectorOps!(long2, long[2]);
    }

    struct double2
    {
        double[2] array;
        mixin VectorOps!(double2, double[2]);
    }
}

static assert(float4.sizeof == 16);
static assert(byte16.sizeof == 16);
static assert(short8.sizeof == 16);
static assert(int4.sizeof == 16);
static assert(long2.sizeof == 16);
static assert(double2.sizeof == 16);


static if (AVXSizedVectorsAreEmulated)
{
    /// AVX-like SIMD types

    struct float8
    {
        float[8] array;
        mixin VectorOps!(float8, float[8]);
    }

    struct byte32
    {
        byte[32] array;
        mixin VectorOps!(byte32, byte[32]);
    }

    struct short16
    {
        short[16] array;
        mixin VectorOps!(short16, short[16]);
    }

    struct int8
    {
        int[8] array;
        mixin VectorOps!(int8, int[8]);
    }

    struct long4
    {
        long[4] array;
        mixin VectorOps!(long4, long[4]);
    }

    struct double4
    {
        double[4] array;
        mixin VectorOps!(double4, double[4]);
    }
}
else
{
    public import core.simd;
}
static assert(float8.sizeof == 32);
static assert(byte32.sizeof == 32);
static assert(short16.sizeof == 32);
static assert(int8.sizeof == 32);
static assert(long4.sizeof == 32);
static assert(double4.sizeof == 32);



alias __m256 = float8;
alias __m256i = long4; // long long __vector with ICC, GCC, and clang
alias __m256d = double4;
alias __m128 = float4;
alias __m128i = int4;
alias __m128d = double2;
alias __m64 = long1; // like in Clang, __m64 is a vector of 1 long
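
// Sketch: since casting between same-sized vector types is a raw copy, packed
// MMX-sized data round-trips through __m64 (a single-long vector) unchanged.
unittest
{
    short4 s = [1, 2, 3, 4];
    __m64 m = cast(__m64) s;
    short4 t = cast(short4) m;
    assert(t.array == s.array);
}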

int _MM_SHUFFLE2(int x, int y) pure @safe
{
    assert(x >= 0 && x <= 1);
    assert(y >= 0 && y <= 1);
    return (x << 1) | y;
}

int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
{
    assert(x >= 0 && x <= 3);
    assert(y >= 0 && y <= 3);
    assert(z >= 0 && z <= 3);
    assert(w >= 0 && w <= 3);
    return (z<<6) | (y<<4) | (x<<2) | w;
}

// test assignment from scalar to vector type
unittest
{
    float4 A = 3.0f;
    float[4] correctA = [3.0f, 3.0f, 3.0f, 3.0f];
    assert(A.array == correctA);

    int2 B = 42;
    int[2] correctB = [42, 42];
    assert(B.array == correctB);
}
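
// Sketch: the shuffle immediates pack one 2-bit (or 1-bit) lane selector per
// argument, with the highest selector in the most significant bits.
unittest
{
    assert(_MM_SHUFFLE2(1, 0) == 2);
    assert(_MM_SHUFFLE(3, 2, 1, 0) == 0b11_10_01_00); // identity shuffle, 0xE4
}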