/**
* `core.simd` emulation layer.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.types;


pure:
nothrow:
@nogc:

version(GNU)
{
    // Note: for GDC support, be sure to use https://explore.dgnu.org/

    version(X86_64)
    {
        enum MMXSizedVectorsAreEmulated = false;
        enum SSESizedVectorsAreEmulated = false;

        // TODO: use D_AVX and D_AVX2 eventually to detect AVX?
        enum AVXSizedVectorsAreEmulated = true;

        import gcc.builtins;

        float4 loadUnaligned(Vec)(const(float)* pvec) @trusted if (is(Vec == float4))
        {
            return __builtin_ia32_loadups(pvec);
        }

        double2 loadUnaligned(Vec)(const(double)* pvec) @trusted if (is(Vec == double2))
        {
            return __builtin_ia32_loadupd(pvec);
        }

        byte16 loadUnaligned(Vec)(const(byte)* pvec) @trusted if (is(Vec == byte16))
        {
            return cast(byte16) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        short8 loadUnaligned(Vec)(const(short)* pvec) @trusted if (is(Vec == short8))
        {
            return cast(short8) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        int4 loadUnaligned(Vec)(const(int)* pvec) @trusted if (is(Vec == int4))
        {
            return cast(int4) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        long2 loadUnaligned(Vec)(const(long)* pvec) @trusted if (is(Vec == long2))
        {
            return cast(long2) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        void storeUnaligned(Vec)(Vec v, float* pvec) @trusted if (is(Vec == float4))
        {
            __builtin_ia32_storeups(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, double* pvec) @trusted if (is(Vec == double2))
        {
            __builtin_ia32_storeupd(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, byte* pvec) @trusted if (is(Vec == byte16))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, short* pvec) @trusted if (is(Vec == short8))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, int* pvec) @trusted if (is(Vec == int4))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, long* pvec) @trusted if (is(Vec == long2))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        // TODO: for performance, replace this with a GDC intrinsic wherever possible
        Vec shufflevector(Vec, mask...)(Vec a, Vec b) @trusted
        {
            enum Count = Vec.array.length;
            static assert(mask.length == Count);

            Vec r = void;
            foreach(int i, m; mask)
            {
                static assert (m < Count * 2);
                int ind = cast(int)m;
                if (ind < Count)
                    r.ptr[i] = a.array[ind];
                else
                    r.ptr[i] = b.array[ind - Count];
            }
            return r;
        }
    }
    else
    {
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}
else version(LDC)
{
    public import ldc.simd;

    // Use this alias to signal that a call should only be used with LDC,
    // for example when an emulated shufflevector would just be wasteful.
    alias shufflevectorLDC = shufflevector;

    enum MMXSizedVectorsAreEmulated = false;
    enum SSESizedVectorsAreEmulated = false;
    enum AVXSizedVectorsAreEmulated = false;
}
else version(DigitalMars)
{
    public import core.simd;

    version(D_SIMD)
    {
        enum MMXSizedVectorsAreEmulated = true;

        static if (__VERSION__ >= 2099)
        {
            // Trying out D_SIMD finally, with DMD 2.099
            enum SSESizedVectorsAreEmulated = false;
        }
        else
        {
            // Basically blocked by DMD backend issues, tagged codegen, backend, or SIMD in Bugzilla.
            enum SSESizedVectorsAreEmulated = true;
        }

        enum AVXSizedVectorsAreEmulated = true;
    }
    else
    {
        // Some DMD 32-bit targets don't have D_SIMD
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}

enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;

version(GNU)
    enum bool DefineGenericLoadStoreUnaligned = false;
else
    enum bool DefineGenericLoadStoreUnaligned = CoreSimdIsEmulated;


static if (CoreSimdIsEmulated)
{
    // core.simd is emulated in some capacity: introduce `VectorOps`

    mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
    {
        enum Count = N;
        alias Base = BaseType;

        BaseType* ptr() return pure nothrow @nogc
        {
            return array.ptr;
        }

        // Unary operators
        VectorType opUnary(string op)() pure nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = " ~ op ~ "array[];");
            return res;
        }

        // Binary operators
        VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = array[] " ~ op ~ " other.array[];");
            return res;
        }

        // Assigning a BaseType value
        void opAssign(BaseType e) pure nothrow @safe @nogc
        {
            array[] = e;
        }

        // Assigning a static array
        void opAssign(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
        {
            mixin("array[] " ~ op ~ "= other.array[];");
        }

        // Construct from a static array
        this(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        // Broadcast constructor
        this(BaseType x) pure nothrow @safe @nogc
        {
            array[] = x;
        }

        /// We can't support implicit conversion but do support explicit casting.
        /// "Vector types of the same size can be implicitly converted among each other."
        /// Casting to another vector type is always just a raw copy.
        VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
            if (VecDest.sizeof == VectorType.sizeof)
        {
            VecDest dest = void;
            // Copy
            dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
            return dest;
        }

        ref inout(BaseType) opIndex(size_t i) inout pure nothrow @safe @nogc
        {
            return array[i];
        }
    }

    // Deprecated: it just wasn't interesting enough, use v.array[i] instead.
    deprecated auto extractelement(Vec, int index, Vec2)(Vec2 vec) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        return v.array[index];
    }
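
    // Illustrative test (a sketch) of the cast documented in `VectorOps.opCast` above:
    // casting between vector types of the same size is a raw bit copy, so 1.0f
    // reinterprets lane-wise as its IEEE-754 bit pattern 0x3f800000.
    unittest
    {
        float4 A = 1.0f;
        int4 bits = cast(int4) A;
        static immutable int[4] correct = [0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000];
        assert(bits.array == correct);
    }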
    // Deprecated: it just wasn't interesting enough, use v.ptr[i] = x instead.
    deprecated auto insertelement(Vec, int index, Vec2)(Vec2 vec, Vec.Base e) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        v.array[index] = e;
        return v;
    }
}
else
{
    public import core.simd;

    // GDC cannot implicitly convert a __vector from signed to unsigned, but LDC can,
    // and LDC sometimes needs those unsigned vector types for some intrinsics.
    // For internal use only.
    package alias ushort8 = Vector!(ushort[8]);
    package alias ubyte8  = Vector!(ubyte[8]);
    package alias ubyte16 = Vector!(ubyte[16]);
}

static if (DefineGenericLoadStoreUnaligned)
{
    template loadUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        Vec loadUnaligned(const(BaseType!Vec)* pvec) @trusted
        {
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                /* enabling this requires moving loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        return cast(Vec)__simd(XMM.LODUPD, *pvec);
                    else static if (is(Vec == float4))
                        return cast(Vec)__simd(XMM.LODUPS, *pvec);
                    else
                        return cast(Vec)__simd(XMM.LODDQU, *pvec);
                }
                else */
                {
                    enum size_t Count = Vec.array.length;
                    Vec result;
                    foreach(int i; 0..Count)
                    {
                        result.ptr[i] = pvec[i];
                    }
                    return result;
                }
            }
            else
            {
                // Since this vector is emulated, it doesn't have alignment constraints
                // and as such we can just cast it.
                return *cast(Vec*)(pvec);
            }
        }
    }

    template storeUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        void storeUnaligned(Vec v, BaseType!Vec* pvec) @trusted
        {
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                /* enabling this requires moving loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        __simd_sto(XMM.STOUPD, *pvec, value);
                    else static if (is(Vec == float4))
                        __simd_sto(XMM.STOUPS, *pvec, value);
                    else
                        __simd_sto(XMM.STODQU, *pvec, value);
                }
                else*/
                {
                    enum size_t Count = Vec.array.length;
                    foreach(int i; 0..Count)
                        pvec[i] = v.array[i];
                }
            }
            else
            {
                *cast(Vec*)(pvec) = v;
            }
        }
    }

    Vec shufflevector(Vec, mask...)(Vec a, Vec b) @safe if (Vec.sizeof < 32)
    {
        enum size_t Count = Vec.array.length;
        static assert(mask.length == Count);

        Vec r = void;
        foreach(int i, m; mask)
        {
            static assert (m < Count * 2);
            enum int ind = cast(int)m;
            static if (ind < Count)
                r.array[i] = a.array[ind];
            else
                r.array[i] = b.array[ind - Count];
        }
        return r;
    }
}
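
// Illustrative test (a sketch, guarded to match the block above) of the generic
// shufflevector fallback: mask indices 0..Count-1 pick lanes from `a`, and
// Count..2*Count-1 pick lanes from `b`.
static if (DefineGenericLoadStoreUnaligned)
{
    unittest
    {
        float4 a = [1.0f, 2.0f, 3.0f, 4.0f];
        float4 b = [5.0f, 6.0f, 7.0f, 8.0f];
        float4 r = shufflevector!(float4, 0, 4, 1, 5)(a, b);
        static immutable float[4] correct = [1.0f, 5.0f, 2.0f, 6.0f];
        assert(r.array == correct);
    }
}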

// Emulate ldc.simd cmpMask and other masks.
// Note: these should be deprecated on non-LDC,
// since the generated code is slower there.
version(LDC)
{}
else
{
    private template BaseType(V)
    {
        alias typeof( ( { V v; return v; }()).array[0]) BaseType;
    }

    private template TrueMask(V)
    {
        alias Elem = BaseType!V;

        static if (is(Elem == float))
        {
            immutable uint m1 = 0xffffffff;
            enum Elem TrueMask = *cast(float*)(&m1);
        }
        else static if (is(Elem == double))
        {
            immutable ulong m1 = 0xffffffff_ffffffff;
            enum Elem TrueMask = *cast(double*)(&m1);
        }
        else // integer case
        {
            enum Elem TrueMask = -1;
        }
    }

    Vec equalMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "oeq" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] == b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }

    Vec notEqualMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "one" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] != b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }

    Vec greaterMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "ogt" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] > b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }
}

unittest
{
    float4 a = [1, 3, 5, 7];
    float4 b = [2, 3, 4, 5];
    int4 c = cast(int4)(greaterMask!float4(a, b));
    static immutable int[4] correct = [0, 0, 0xffff_ffff, 0xffff_ffff];
    assert(c.array == correct);
}
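
// Illustrative test (a sketch) of the float semantics of equalMask noted above: it is an
// ordered ("oeq") comparison, so a NaN lane never compares equal and yields a zero mask.
// This mirrors the greaterMask test just above; on LDC the symbol is assumed to resolve
// to ldc.simd.equalMask, like greaterMask does there.
unittest
{
    float4 a = [1.0f, 3.0f, float.nan, 7.0f];
    float4 b = [1.0f, 4.0f, float.nan, 7.0f];
    int4 c = cast(int4)(equalMask!float4(a, b));
    static immutable int[4] correct = [0xffff_ffff, 0, 0, 0xffff_ffff];
    assert(c.array == correct);
}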

static if (MMXSizedVectorsAreEmulated)
{
    /// MMX-like SIMD types
    struct float2
    {
        float[2] array;
        mixin VectorOps!(float2, float[2]);
    }

    struct byte8
    {
        byte[8] array;
        mixin VectorOps!(byte8, byte[8]);
    }

    struct short4
    {
        short[4] array;
        mixin VectorOps!(short4, short[4]);
    }

    struct int2
    {
        int[2] array;
        mixin VectorOps!(int2, int[2]);
    }

    struct long1
    {
        long[1] array;
        mixin VectorOps!(long1, long[1]);
    }
}
else
{
    // For this compiler, defining MMX-sized vectors works.
    public import core.simd;
    alias Vector!(long [1]) long1;
    alias Vector!(float[2]) float2;
    alias Vector!(int  [2]) int2;
    alias Vector!(short[4]) short4;
    alias Vector!(byte [8]) byte8;
}

static assert(float2.sizeof == 8);
static assert(byte8.sizeof  == 8);
static assert(short4.sizeof == 8);
static assert(int2.sizeof   == 8);
static assert(long1.sizeof  == 8);


static if (SSESizedVectorsAreEmulated)
{
    /// SSE-like SIMD types

    struct float4
    {
        float[4] array;
        mixin VectorOps!(float4, float[4]);
    }

    struct byte16
    {
        byte[16] array;
        mixin VectorOps!(byte16, byte[16]);
    }

    struct short8
    {
        short[8] array;
        mixin VectorOps!(short8, short[8]);
    }

    struct int4
    {
        int[4] array;
        mixin VectorOps!(int4, int[4]);
    }

    struct long2
    {
        long[2] array;
        mixin VectorOps!(long2, long[2]);
    }

    struct double2
    {
        double[2] array;
        mixin VectorOps!(double2, double[2]);
    }
}

static assert(float4.sizeof  == 16);
static assert(byte16.sizeof  == 16);
static assert(short8.sizeof  == 16);
static assert(int4.sizeof    == 16);
static assert(long2.sizeof   == 16);
static assert(double2.sizeof == 16);


static if (AVXSizedVectorsAreEmulated)
{
    /// AVX-like SIMD types

    struct float8
    {
        float[8] array;
        mixin VectorOps!(float8, float[8]);
    }

    struct byte32
    {
        byte[32] array;
        mixin VectorOps!(byte32, byte[32]);
    }

    struct short16
    {
        short[16] array;
        mixin VectorOps!(short16, short[16]);
    }

    struct int8
    {
        int[8] array;
        mixin VectorOps!(int8, int[8]);
    }

    struct long4
    {
        long[4] array;
        mixin VectorOps!(long4, long[4]);
    }

    struct double4
    {
        double[4] array;
        mixin VectorOps!(double4, double[4]);
    }
}

static assert(float8.sizeof  == 32);
static assert(byte32.sizeof  == 32);
static assert(short16.sizeof == 32);
static assert(int8.sizeof    == 32);
static assert(long4.sizeof   == 32);
static assert(double4.sizeof == 32);


alias __m256  = float8;
alias __m256i = long4; // long long __vector with ICC, GCC, and clang
alias __m256d = double4;
alias __m128  = float4;
alias __m128i = int4;
alias __m128d = double2;
alias __m64   = long1; // like in Clang, __m64 is a vector of 1 long

int _MM_SHUFFLE2(int x, int y) pure @safe
{
    assert(x >= 0 && x <= 1);
    assert(y >= 0 && y <= 1);
    return (x << 1) | y;
}

int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
{
    assert(x >= 0 && x <= 3);
    assert(y >= 0 && y <= 3);
    assert(z >= 0 && z <= 3);
    assert(w >= 0 && w <= 3);
    return (z << 6) | (y << 4) | (x << 2) | w;
}

// test assignment from scalar to vector type
unittest
{
    float4 A = 3.0f;
    float[4] correctA = [3.0f, 3.0f, 3.0f, 3.0f];
    assert(A.array == correctA);

    int2 B = 42;
    int[2] correctB = [42, 42];
    assert(B.array == correctB);
}
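
// Illustrative test (a sketch) of the shuffle-constant helpers above:
// _MM_SHUFFLE packs four 2-bit lane selectors from high to low,
// and _MM_SHUFFLE2 packs two 1-bit selectors.
unittest
{
    assert(_MM_SHUFFLE(3, 2, 1, 0) == 0b11_10_01_00);
    assert(_MM_SHUFFLE(0, 1, 2, 3) == 0b00_01_10_11);
    assert(_MM_SHUFFLE2(1, 0) == 0b10);
    assert(_MM_SHUFFLE2(0, 1) == 0b01);
}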