/**
 * `core.simd` emulation layer.
 *
 * Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module inteli.types;


pure:
nothrow:
@nogc:

version(GNU)
{
    // Note: for GDC support, be sure to use https://explore.dgnu.org/

    version(X86_64)
    {
        enum MMXSizedVectorsAreEmulated = false;
        enum SSESizedVectorsAreEmulated = false;

        // TODO: use D_AVX and D_AVX2 eventually to detect AVX?
        enum AVXSizedVectorsAreEmulated = true;

        import gcc.builtins;

        float4 loadUnaligned(Vec)(const(float)* pvec) @trusted if (is(Vec == float4))
        {
            return __builtin_ia32_loadups(pvec);
        }

        double2 loadUnaligned(Vec)(const(double)* pvec) @trusted if (is(Vec == double2))
        {
            return __builtin_ia32_loadupd(pvec);
        }

        byte16 loadUnaligned(Vec)(const(byte)* pvec) @trusted if (is(Vec == byte16))
        {
            return cast(byte16) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        short8 loadUnaligned(Vec)(const(short)* pvec) @trusted if (is(Vec == short8))
        {
            return cast(short8) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        int4 loadUnaligned(Vec)(const(int)* pvec) @trusted if (is(Vec == int4))
        {
            return cast(int4) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        long2 loadUnaligned(Vec)(const(long)* pvec) @trusted if (is(Vec == long2))
        {
            return cast(long2) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        void storeUnaligned(Vec)(Vec v, float* pvec) @trusted if (is(Vec == float4))
        {
            __builtin_ia32_storeups(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, double* pvec) @trusted if (is(Vec == double2))
        {
            __builtin_ia32_storeupd(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, byte* pvec) @trusted if (is(Vec == byte16))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, short* pvec) @trusted if (is(Vec == short8))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, int* pvec) @trusted if (is(Vec == int4))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, long* pvec) @trusted if (is(Vec == long2))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        // TODO: for performance, replace this anywhere possible with a GDC intrinsic.
        Vec shufflevector(Vec, mask...)(Vec a, Vec b) @trusted
        {
            enum Count = Vec.array.length;
            static assert(mask.length == Count);

            Vec r = void;
            foreach(int i, m; mask)
            {
                static assert(m < Count * 2);
                int ind = cast(int)m;
                if (ind < Count)
                    r.ptr[i] = a.array[ind];
                else
                    r.ptr[i] = b.array[ind - Count];
            }
            return r;
        }
    }
    else
    {
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}
else version(LDC)
{
    public import ldc.simd;

    // Use this alias to signal that a shuffle should only be used with LDC,
    // for example where the emulated shufflevector would just be wasteful.
    alias shufflevectorLDC = shufflevector;

    enum MMXSizedVectorsAreEmulated = false;
    enum SSESizedVectorsAreEmulated = false;
    enum AVXSizedVectorsAreEmulated = false;
}
else version(DigitalMars)
{
    public import core.simd;

    version(D_SIMD)
    {
        enum MMXSizedVectorsAreEmulated = true;

        static if (__VERSION__ >= 2100)
        {
            // Tried out D_SIMD finally, with DMD 2.100:
            //enum SSESizedVectorsAreEmulated = false;

            // It didn't work; maybe one day.
            enum SSESizedVectorsAreEmulated = true;
        }
        else
        {
            // Basically blocked by DMD backend issues, tagged codegen, backend, or SIMD in Bugzilla.
            enum SSESizedVectorsAreEmulated = true;
        }

        enum AVXSizedVectorsAreEmulated = true;
    }
    else
    {
        // Some DMD 32-bit targets don't have D_SIMD.
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}

enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;

version(GNU)
    enum bool DefineGenericLoadStoreUnaligned = false;
else
    enum bool DefineGenericLoadStoreUnaligned = CoreSimdIsEmulated;
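
// A minimal usage sketch of `shufflevector`, exercising whichever
// implementation the compiler branch above provides: mask indices
// 0..Count-1 select lanes of `a`, Count..2*Count-1 select lanes of `b`.
unittest
{
    int4 a = [0, 1, 2, 3];
    int4 b = [4, 5, 6, 7];

    // Interleave the low halves of a and b.
    int4 r = shufflevector!(int4, 0, 4, 1, 5)(a, b);
    static immutable int[4] correct = [0, 4, 1, 5];
    assert(r.array == correct);
}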

static if (CoreSimdIsEmulated)
{
    // core.simd is emulated in some capacity: introduce `VectorOps`

    mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
    {
        enum Count = N;
        alias Base = BaseType;

        BaseType* ptr() return pure nothrow @nogc
        {
            return array.ptr;
        }

        // Unary operators
        VectorType opUnary(string op)() pure nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = " ~ op ~ "array[];");
            return res;
        }

        // Binary operators
        VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = array[] " ~ op ~ " other.array[];");
            return res;
        }

        // Assigning a BaseType value broadcasts it to all lanes
        void opAssign(BaseType e) pure nothrow @safe @nogc
        {
            array[] = e;
        }

        // Assigning a static array
        void opAssign(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
        {
            mixin("array[] " ~ op ~ "= other.array[];");
        }

        // Construct from a static array
        this(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        // Broadcast constructor
        this(BaseType x) pure nothrow @safe @nogc
        {
            array[] = x;
        }

        /// We can't support implicit conversion but do support explicit casting.
        /// "Vector types of the same size can be implicitly converted among each other."
        /// Casting to another vector type is always just a raw copy.
        VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
            if (VecDest.sizeof == VectorType.sizeof)
        {
            VecDest dest = void;
            // Copy
            dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
            return dest;
        }

        ref inout(BaseType) opIndex(size_t i) inout return pure nothrow @safe @nogc
        {
            return array[i];
        }
    }

    // Deprecated because it just wasn't interesting enough; use v.array[i] instead.
    deprecated auto extractelement(Vec, int index, Vec2)(Vec2 vec) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        return v.array[index];
    }

    // Deprecated because it just wasn't interesting enough; use v.ptr[i] = x instead.
    deprecated auto insertelement(Vec, int index, Vec2)(Vec2 vec, Vec.Base e) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        v.array[index] = e;
        return v;
    }
}
else
{
    public import core.simd;

    // GDC cannot implicitly convert a __vector from signed to unsigned, but LDC can,
    // and LDC sometimes needs those unsigned vector types for some intrinsics.
    // For internal use only.
    package alias ushort8 = Vector!(ushort[8]);
    package alias ubyte8  = Vector!(ubyte[8]);
    package alias ubyte16 = Vector!(ubyte[16]);
}
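
// A minimal sketch of the explicit-cast semantics documented in `VectorOps`:
// casting between same-sized vector types is a raw bit copy, never a
// lane-by-lane conversion. The same holds for real core.simd vectors.
unittest
{
    int4 x = [0x3F80_0000, 0, 0, 0]; // IEEE-754 bit pattern of 1.0f in lane 0
    float4 f = cast(float4) x;
    assert(f.array[0] == 1.0f);
    assert(f.array[3] == 0.0f);
}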

static if (DefineGenericLoadStoreUnaligned)
{
    template loadUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        Vec loadUnaligned(const(BaseType!Vec)* pvec) @trusted
        {
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* enabling this requires moving loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        return cast(Vec)__simd(XMM.LODUPD, *pvec);
                    else static if (is(Vec == float4))
                        return cast(Vec)__simd(XMM.LODUPS, *pvec);
                    else
                        return cast(Vec)__simd(XMM.LODDQU, *pvec);
                }
                else */
                {
                    enum size_t Count = Vec.array.length;
                    Vec result;
                    foreach(int i; 0..Count)
                    {
                        result.ptr[i] = pvec[i];
                    }
                    return result;
                }
            }
            else
            {
                // Since this vector is emulated, it doesn't have alignment constraints
                // and as such we can just cast it.
                return *cast(Vec*)(pvec);
            }
        }
    }

    template storeUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        void storeUnaligned(Vec v, BaseType!Vec* pvec) @trusted
        {
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                // BUG: code is wrong, should cast to Vec, see https://github.com/dlang/druntime/pull/3808/commits/b5670753248ec3b1631a0eb8ca76a27e8d6a39b9
                /* enabling this requires moving loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        __simd_sto(XMM.STOUPD, *pvec, value);
                    else static if (is(Vec == float4))
                        __simd_sto(XMM.STOUPS, *pvec, value);
                    else
                        __simd_sto(XMM.STODQU, *pvec, value);
                }
                else*/
                {
                    enum size_t Count = Vec.array.length;
                    foreach(int i; 0..Count)
                        pvec[i] = v.array[i];
                }
            }
            else
            {
                *cast(Vec*)(pvec) = v;
            }
        }
    }

    Vec shufflevector(Vec, mask...)(Vec a, Vec b) @safe if (Vec.sizeof < 32)
    {
        enum size_t Count = Vec.array.length;
        static assert(mask.length == Count);

        Vec r = void;
        foreach(int i, m; mask)
        {
            static assert(m < Count * 2);
            enum int ind = cast(int)m;
            static if (ind < Count)
                r.array[i] = a.array[ind];
            else
                r.array[i] = b.array[ind - Count];
        }
        return r;
    }
}
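
// A minimal round-trip sketch for loadUnaligned/storeUnaligned: &buf[1] is
// only guaranteed float-aligned, which is exactly what these entry points
// permit (no 16-byte alignment required).
unittest
{
    float[5] buf = [0.0f, 1.0f, 2.0f, 3.0f, 4.0f];

    float4 v = loadUnaligned!float4(&buf[1]);
    static immutable float[4] correct = [1.0f, 2.0f, 3.0f, 4.0f];
    assert(v.array == correct);

    storeUnaligned!float4(v, &buf[0]);
    assert(buf[0] == 1.0f && buf[3] == 4.0f);
}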

// Emulate ldc.simd cmpMask and other masks.
// Note: these should be deprecated on non-LDC,
// since it's slower to generate that code.
version(LDC)
{}
else
{
    private template BaseType(V)
    {
        alias typeof( ( { V v; return v; }()).array[0]) BaseType;
    }

    private template TrueMask(V)
    {
        alias Elem = BaseType!V;

        static if (is(Elem == float))
        {
            immutable uint m1 = 0xffffffff;
            enum Elem TrueMask = *cast(float*)(&m1);
        }
        else static if (is(Elem == double))
        {
            immutable ulong m1 = 0xffffffff_ffffffff;
            enum Elem TrueMask = *cast(double*)(&m1);
        }
        else // integer case
        {
            enum Elem TrueMask = -1;
        }
    }

    Vec equalMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "oeq" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] == b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }

    Vec notEqualMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "one" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] != b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }

    Vec greaterMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "ogt" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] > b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }
}
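
// A minimal sketch of the mask convention: a "true" lane is all 1 bits
// (TrueMask), a "false" lane is all 0 bits, so results compose with bitwise
// operators. On LDC the equivalent functions come from ldc.simd instead.
unittest
{
    int4 a = [1, 2, 3, 4];
    int4 b = [1, 0, 3, 0];
    int4 m = equalMask!int4(a, b);
    static immutable int[4] correct = [-1, 0, -1, 0];
    assert(m.array == correct);
}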

unittest
{
    float4 a = [1, 3, 5, 7];
    float4 b = [2, 3, 4, 5];
    int4 c = cast(int4)(greaterMask!float4(a, b));
    static immutable int[4] correct = [0, 0, 0xffff_ffff, 0xffff_ffff];
    assert(c.array == correct);
}

static if (MMXSizedVectorsAreEmulated)
{
    /// MMX-like SIMD types
    struct float2
    {
        float[2] array;
        mixin VectorOps!(float2, float[2]);
    }

    struct byte8
    {
        byte[8] array;
        mixin VectorOps!(byte8, byte[8]);
    }

    struct short4
    {
        short[4] array;
        mixin VectorOps!(short4, short[4]);
    }

    struct int2
    {
        int[2] array;
        mixin VectorOps!(int2, int[2]);
    }

    struct long1
    {
        long[1] array;
        mixin VectorOps!(long1, long[1]);
    }
}
else
{
    // For this compiler, defining MMX-sized vectors works.
    public import core.simd;
    alias Vector!(long [1]) long1;
    alias Vector!(float[2]) float2;
    alias Vector!(int  [2]) int2;
    alias Vector!(short[4]) short4;
    alias Vector!(byte [8]) byte8;
}

static assert(float2.sizeof == 8);
static assert(byte8.sizeof == 8);
static assert(short4.sizeof == 8);
static assert(int2.sizeof == 8);
static assert(long1.sizeof == 8);


static if (SSESizedVectorsAreEmulated)
{
    /// SSE-like SIMD types

    struct float4
    {
        float[4] array;
        mixin VectorOps!(float4, float[4]);
    }

    struct byte16
    {
        byte[16] array;
        mixin VectorOps!(byte16, byte[16]);
    }

    struct short8
    {
        short[8] array;
        mixin VectorOps!(short8, short[8]);
    }

    struct int4
    {
        int[4] array;
        mixin VectorOps!(int4, int[4]);
    }

    struct long2
    {
        long[2] array;
        mixin VectorOps!(long2, long[2]);
    }

    struct double2
    {
        double[2] array;
        mixin VectorOps!(double2, double[2]);
    }
}

static assert(float4.sizeof == 16);
static assert(byte16.sizeof == 16);
static assert(short8.sizeof == 16);
static assert(int4.sizeof == 16);
static assert(long2.sizeof == 16);
static assert(double2.sizeof == 16);


static if (AVXSizedVectorsAreEmulated)
{
    /// AVX-like SIMD types

    struct float8
    {
        float[8] array;
        mixin VectorOps!(float8, float[8]);
    }

    struct byte32
    {
        byte[32] array;
        mixin VectorOps!(byte32, byte[32]);
    }

    struct short16
    {
        short[16] array;
        mixin VectorOps!(short16, short[16]);
    }

    struct int8
    {
        int[8] array;
        mixin VectorOps!(int8, int[8]);
    }

    struct long4
    {
        long[4] array;
        mixin VectorOps!(long4, long[4]);
    }

    struct double4
    {
        double[4] array;
        mixin VectorOps!(double4, double[4]);
    }
}

static assert(float8.sizeof == 32);
static assert(byte32.sizeof == 32);
static assert(short16.sizeof == 32);
static assert(int8.sizeof == 32);
static assert(long4.sizeof == 32);
static assert(double4.sizeof == 32);


alias __m256  = float8;
alias __m256i = long4; // long long __vector with ICC, GCC, and clang
alias __m256d = double4;
alias __m128  = float4;
alias __m128i = int4;
alias __m128d = double2;
alias __m64   = long1; // like in Clang, __m64 is a vector of 1 long

int _MM_SHUFFLE2(int x, int y) pure @safe
{
    assert(x >= 0 && x <= 1);
    assert(y >= 0 && y <= 1);
    return (x << 1) | y;
}
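
// Quick sanity check of the 2-lane shuffle immediate encoding:
// x lands in bit 1, y in bit 0.
unittest
{
    assert(_MM_SHUFFLE2(0, 0) == 0);
    assert(_MM_SHUFFLE2(1, 0) == 2);
    assert(_MM_SHUFFLE2(1, 1) == 3);
}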

int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
{
    assert(x >= 0 && x <= 3);
    assert(y >= 0 && y <= 3);
    assert(z >= 0 && z <= 3);
    assert(w >= 0 && w <= 3);
    return (z << 6) | (y << 4) | (x << 2) | w;
}

// Test assignment from scalar to vector type.
unittest
{
    float4 A = 3.0f;
    float[4] correctA = [3.0f, 3.0f, 3.0f, 3.0f];
    assert(A.array == correctA);

    int2 B = 42;
    int[2] correctB = [42, 42];
    assert(B.array == correctB);
}
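
// A minimal sketch of elementwise arithmetic; it behaves identically whether
// the vector types are the emulated structs above or real core.simd vectors.
unittest
{
    int4 a = [1, 2, 3, 4];
    int4 b = [10, 20, 30, 40];

    int4 c = a + b;
    static immutable int[4] correctSum = [11, 22, 33, 44];
    assert(c.array == correctSum);

    c += a;
    static immutable int[4] correctAcc = [12, 24, 36, 48];
    assert(c.array == correctAcc);
}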