/**
* `core.simd` emulation layer.
*
* Copyright: Copyright Guillaume Piolat 2016-2020, Stefanos Baziotis 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.types;


pure:
nothrow:
@nogc:

version(GNU)
{
    // Note: for GDC support, be sure to use https://explore.dgnu.org/

    version(X86_64)
    {
        enum MMXSizedVectorsAreEmulated = false;
        enum SSESizedVectorsAreEmulated = false;

        // TODO: use D_AVX and D_AVX2 eventually to detect AVX?
        enum AVXSizedVectorsAreEmulated = true;

        import gcc.builtins;

        float4 loadUnaligned(Vec)(const(float)* pvec) @trusted if (is(Vec == float4))
        {
            return __builtin_ia32_loadups(pvec);
        }

        double2 loadUnaligned(Vec)(const(double)* pvec) @trusted if (is(Vec == double2))
        {
            return __builtin_ia32_loadupd(pvec);
        }

        byte16 loadUnaligned(Vec)(const(byte)* pvec) @trusted if (is(Vec == byte16))
        {
            return cast(byte16) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        short8 loadUnaligned(Vec)(const(short)* pvec) @trusted if (is(Vec == short8))
        {
            return cast(short8) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        int4 loadUnaligned(Vec)(const(int)* pvec) @trusted if (is(Vec == int4))
        {
            return cast(int4) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        long2 loadUnaligned(Vec)(const(long)* pvec) @trusted if (is(Vec == long2))
        {
            return cast(long2) __builtin_ia32_loaddqu(cast(const(char)*) pvec);
        }

        void storeUnaligned(Vec)(Vec v, float* pvec) @trusted if (is(Vec == float4))
        {
            __builtin_ia32_storeups(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, double* pvec) @trusted if (is(Vec == double2))
        {
            __builtin_ia32_storeupd(pvec, v);
        }

        void storeUnaligned(Vec)(Vec v, byte* pvec) @trusted if (is(Vec == byte16))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, short* pvec) @trusted if (is(Vec == short8))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, int* pvec) @trusted if (is(Vec == int4))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        void storeUnaligned(Vec)(Vec v, long* pvec) @trusted if (is(Vec == long2))
        {
            __builtin_ia32_storedqu(cast(char*)pvec, cast(ubyte16)v);
        }

        // TODO: for performance, replace this with a GDC intrinsic wherever possible
        Vec shufflevector(Vec, mask...)(Vec a, Vec b) @trusted
        {
            enum Count = Vec.array.length;
            static assert(mask.length == Count);

            Vec r = void;
            foreach(int i, m; mask)
            {
                static assert(m < Count * 2);
                int ind = cast(int)m;
                if (ind < Count)
                    r.ptr[i] = a.array[ind];
                else
                    r.ptr[i] = b.array[ind - Count];
            }
            return r;
        }
    }
    else
    {
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}
else version(LDC)
{
    public import ldc.simd;

    // Use this alias to mention it should only be used with LDC,
    // for example when emulated shufflevector would just be wasteful.
    alias shufflevectorLDC = shufflevector;

    enum MMXSizedVectorsAreEmulated = false;
    enum SSESizedVectorsAreEmulated = false;
    enum AVXSizedVectorsAreEmulated = false;
}
else version(DigitalMars)
{
    public import core.simd;

    version(D_SIMD)
    {
        enum MMXSizedVectorsAreEmulated = true;

        static if (__VERSION__ >= 2099)
        {
            // Trying out D_SIMD finally, with DMD 2.099
            //enum SSESizedVectorsAreEmulated = false;

            // It didn't work, maybe one day.
            enum SSESizedVectorsAreEmulated = true;
        }
        else
        {
            // Basically blocked by DMD backend issues tagged codegen, backend, or SIMD in Bugzilla.
            enum SSESizedVectorsAreEmulated = true;
        }

        enum AVXSizedVectorsAreEmulated = true;
    }
    else
    {
        // Some DMD 32-bit targets don't have D_SIMD
        enum MMXSizedVectorsAreEmulated = true;
        enum SSESizedVectorsAreEmulated = true;
        enum AVXSizedVectorsAreEmulated = true;
    }
}

enum CoreSimdIsEmulated = MMXSizedVectorsAreEmulated || SSESizedVectorsAreEmulated || AVXSizedVectorsAreEmulated;

version(GNU)
    enum bool DefineGenericLoadStoreUnaligned = false;
else
    enum bool DefineGenericLoadStoreUnaligned = CoreSimdIsEmulated;


static if (CoreSimdIsEmulated)
{
    // core.simd is emulated in some capacity: introduce `VectorOps`

    mixin template VectorOps(VectorType, ArrayType: BaseType[N], BaseType, size_t N)
    {
        enum Count = N;
        alias Base = BaseType;

        BaseType* ptr() return pure nothrow @nogc
        {
            return array.ptr;
        }

        // Unary operators
        VectorType opUnary(string op)() pure nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = " ~ op ~ "array[];");
            return res;
        }

        // Binary operators
        VectorType opBinary(string op)(VectorType other) pure const nothrow @safe @nogc
        {
            VectorType res = void;
            mixin("res.array[] = array[] " ~ op ~ " other.array[];");
            return res;
        }

        // Assigning a BaseType value
        void opAssign(BaseType e) pure nothrow @safe @nogc
        {
            array[] = e;
        }

        // Assigning a static array
        void opAssign(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        void opOpAssign(string op)(VectorType other) pure nothrow @safe @nogc
        {
            mixin("array[] " ~ op ~ "= other.array[];");
        }

        // Construct from a static array
        this(ArrayType v) pure nothrow @safe @nogc
        {
            array[] = v[];
        }

        // Broadcast constructor
        this(BaseType x) pure nothrow @safe @nogc
        {
            array[] = x;
        }

        /// We can't support implicit conversion but do support explicit casting.
        /// "Vector types of the same size can be implicitly converted among each other."
        /// Casting to another vector type is always just a raw copy.
        VecDest opCast(VecDest)() pure const nothrow @trusted @nogc
            if (VecDest.sizeof == VectorType.sizeof)
        {
            VecDest dest = void;
            // Copy
            dest.array[] = (cast(typeof(dest.array))cast(void[VectorType.sizeof])array)[];
            return dest;
        }

        ref inout(BaseType) opIndex(size_t i) inout pure nothrow @safe @nogc
        {
            return array[i];
        }

    }

    // These just weren't interesting enough; use v.array[i] instead.
    deprecated auto extractelement(Vec, int index, Vec2)(Vec2 vec) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        return v.array[index];
    }

    // These just weren't interesting enough; use v.ptr[i] = x instead.
    deprecated auto insertelement(Vec, int index, Vec2)(Vec2 vec, Vec.Base e) @trusted
    {
        static assert(Vec.sizeof == Vec2.sizeof);
        import core.stdc.string: memcpy;
        Vec v = void;
        memcpy(&v, &vec, Vec2.sizeof);
        v.array[index] = e;
        return v;
    }
}
else
{
    public import core.simd;

    // GDC cannot implicitly convert __vector from signed to unsigned, but LDC can,
    // and LDC sometimes needs those unsigned vector types for some intrinsics.
    // For internal use only.
    package alias ushort8 = Vector!(ushort[8]);
    package alias ubyte8  = Vector!(ubyte[8]);
    package alias ubyte16 = Vector!(ubyte[16]);
}

static if (DefineGenericLoadStoreUnaligned)
{
    template loadUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        Vec loadUnaligned(const(BaseType!Vec)* pvec) @trusted
        {
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                /* enabling this needs moving loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        return cast(Vec)__simd(XMM.LODUPD, *pvec);
                    else static if (is(Vec == float4))
                        return cast(Vec)__simd(XMM.LODUPS, *pvec);
                    else
                        return cast(Vec)__simd(XMM.LODDQU, *pvec);
                }
                else */
                {
                    enum size_t Count = Vec.array.length;
                    Vec result;
                    foreach(int i; 0..Count)
                    {
                        result.ptr[i] = pvec[i];
                    }
                    return result;
                }
            }
            else
            {
                // Since this vector is emulated, it doesn't have alignment constraints
                // and as such we can just cast it.
                return *cast(Vec*)(pvec);
            }
        }
    }

    template storeUnaligned(Vec)
    {
        // Note: can't be @safe with this signature
        void storeUnaligned(Vec v, BaseType!Vec* pvec) @trusted
        {
            enum bool isVector = ( (Vec.sizeof == 8)  && (!MMXSizedVectorsAreEmulated)
                                || (Vec.sizeof == 16) && (!SSESizedVectorsAreEmulated)
                                || (Vec.sizeof == 32) && (!AVXSizedVectorsAreEmulated) );

            static if (isVector)
            {
                // PERF DMD
                /* enabling this needs moving loadUnaligned and storeUnaligned to internals.d
                static if (DMD_with_DSIMD && Vec.sizeof == 8)
                {
                    static if (is(Vec == double2))
                        __simd_sto(XMM.STOUPD, *pvec, value);
                    else static if (is(Vec == float4))
                        __simd_sto(XMM.STOUPS, *pvec, value);
                    else
                        __simd_sto(XMM.STODQU, *pvec, value);
                }
                else*/
                {
                    enum size_t Count = Vec.array.length;
                    foreach(int i; 0..Count)
                        pvec[i] = v.array[i];
                }
            }
            else
            {
                *cast(Vec*)(pvec) = v;
            }
        }
    }

    Vec shufflevector(Vec, mask...)(Vec a, Vec b) @safe if (Vec.sizeof < 32)
    {
        enum size_t Count = Vec.array.length;
        static assert(mask.length == Count);

        Vec r = void;
        foreach(int i, m; mask)
        {
            static assert(m < Count * 2);
            enum int ind = cast(int)m;
            static if (ind < Count)
                r.array[i] = a.array[ind];
            else
                r.array[i] = b.array[ind - Count];
        }
        return r;
    }
}

// Emulate ldc.simd cmpMask and other masks.
// Note: these should be deprecated on non-LDC,
// since it's slower to generate that code.
version(LDC)
{}
else
{
    private template BaseType(V)
    {
        alias typeof( ( { V v; return v; }()).array[0]) BaseType;
    }

    private template TrueMask(V)
    {
        alias Elem = BaseType!V;

        static if (is(Elem == float))
        {
            immutable uint m1 = 0xffffffff;
            enum Elem TrueMask = *cast(float*)(&m1);
        }
        else static if (is(Elem == double))
        {
            immutable ulong m1 = 0xffffffff_ffffffff;
            enum Elem TrueMask = *cast(double*)(&m1);
        }
        else // integer case
        {
            enum Elem TrueMask = -1;
        }
    }

    Vec equalMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "oeq" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] == b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }

    Vec notEqualMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "one" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] != b.array[i];
            result.ptr[i] = cond ? TrueMask!Vec : 0;
        }
        return result;
    }

    Vec greaterMask(Vec)(Vec a, Vec b) @trusted // for floats, equivalent to "ogt" comparison
    {
        enum size_t Count = Vec.array.length;
        Vec result;
        foreach(int i; 0..Count)
        {
            bool cond = a.array[i] > b.array[i];
            result.ptr[i] = cond ?
                TrueMask!Vec : 0;
        }
        return result;
    }
}

unittest
{
    float4 a = [1, 3, 5, 7];
    float4 b = [2, 3, 4, 5];
    int4 c = cast(int4)(greaterMask!float4(a, b));
    static immutable int[4] correct = [0, 0, 0xffff_ffff, 0xffff_ffff];
    assert(c.array == correct);
}

static if (MMXSizedVectorsAreEmulated)
{
    /// MMX-like SIMD types
    struct float2
    {
        float[2] array;
        mixin VectorOps!(float2, float[2]);
    }

    struct byte8
    {
        byte[8] array;
        mixin VectorOps!(byte8, byte[8]);
    }

    struct short4
    {
        short[4] array;
        mixin VectorOps!(short4, short[4]);
    }

    struct int2
    {
        int[2] array;
        mixin VectorOps!(int2, int[2]);
    }

    struct long1
    {
        long[1] array;
        mixin VectorOps!(long1, long[1]);
    }
}
else
{
    // For this compiler, defining MMX-sized vectors works.
    public import core.simd;
    alias Vector!(long [1]) long1;
    alias Vector!(float[2]) float2;
    alias Vector!(int  [2]) int2;
    alias Vector!(short[4]) short4;
    alias Vector!(byte [8]) byte8;
}

static assert(float2.sizeof == 8);
static assert(byte8.sizeof  == 8);
static assert(short4.sizeof == 8);
static assert(int2.sizeof   == 8);
static assert(long1.sizeof  == 8);


static if (SSESizedVectorsAreEmulated)
{
    /// SSE-like SIMD types

    struct float4
    {
        float[4] array;
        mixin VectorOps!(float4, float[4]);
    }

    struct byte16
    {
        byte[16] array;
        mixin VectorOps!(byte16, byte[16]);
    }

    struct short8
    {
        short[8] array;
        mixin VectorOps!(short8, short[8]);
    }

    struct int4
    {
        int[4] array;
        mixin VectorOps!(int4, int[4]);
    }

    struct long2
    {
        long[2] array;
        mixin VectorOps!(long2, long[2]);
    }

    struct double2
    {
        double[2] array;
        mixin VectorOps!(double2, double[2]);
    }
}

static assert(float4.sizeof  == 16);
static assert(byte16.sizeof  == 16);
static assert(short8.sizeof  == 16);
static assert(int4.sizeof    == 16);
static assert(long2.sizeof   == 16);
static assert(double2.sizeof == 16);


static if (AVXSizedVectorsAreEmulated)
{
    /// AVX-like SIMD types

    struct float8
    {
        float[8] array;
        mixin VectorOps!(float8, float[8]);
    }

    struct byte32
    {
        byte[32] array;
        mixin VectorOps!(byte32, byte[32]);
    }

    struct short16
    {
        short[16] array;
        mixin VectorOps!(short16, short[16]);
    }

    struct int8
    {
        int[8] array;
        mixin VectorOps!(int8, int[8]);
    }

    struct long4
    {
        long[4] array;
        mixin VectorOps!(long4, long[4]);
    }

    struct double4
    {
        double[4] array;
        mixin VectorOps!(double4, double[4]);
    }
}

static assert(float8.sizeof  == 32);
static assert(byte32.sizeof  == 32);
static assert(short16.sizeof == 32);
static assert(int8.sizeof    == 32);
static assert(long4.sizeof   == 32);
static assert(double4.sizeof == 32);


alias __m256  = float8;
alias __m256i = long4; // long long __vector with ICC, GCC, and clang
alias __m256d = double4;
alias __m128  = float4;
alias __m128i = int4;
alias __m128d = double2;
alias __m64   = long1; // like in Clang, __m64 is a vector of 1 long

int _MM_SHUFFLE2(int x, int y) pure @safe
{
    assert(x >= 0 && x <= 1);
    assert(y >= 0 && y <= 1);
    return (x << 1) | y;
}
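
// Illustrative check (added here, not part of the original source): the
// immediate built by _MM_SHUFFLE2 packs two 1-bit lane selectors, with `x`
// in bit 1 and `y` in bit 0.
unittest
{
    assert(_MM_SHUFFLE2(0, 0) == 0);
    assert(_MM_SHUFFLE2(1, 0) == 0b10);
    assert(_MM_SHUFFLE2(1, 1) == 0b11);
}
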
int _MM_SHUFFLE(int z, int y, int x, int w) pure @safe
{
    assert(x >= 0 && x <= 3);
    assert(y >= 0 && y <= 3);
    assert(z >= 0 && z <= 3);
    assert(w >= 0 && w <= 3);
    return (z << 6) | (y << 4) | (x << 2) | w;
}

// test assignment from scalar to vector type
unittest
{
    float4 A = 3.0f;
    float[4] correctA = [3.0f, 3.0f, 3.0f, 3.0f];
    assert(A.array == correctA);

    int2 B = 42;
    int[2] correctB = [42, 42];
    assert(B.array == correctB);
}
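
// Illustrative sketch (added, not from the original source): _MM_SHUFFLE
// packs four 2-bit lane selectors as (z<<6)|(y<<4)|(x<<2)|w, and the
// element-wise operators provided by VectorOps (or by core.simd when the
// vector types are not emulated) behave the same either way.
unittest
{
    // Identity order for a 4-lane shuffle encodes as 0b11_10_01_00 (0xE4).
    assert(_MM_SHUFFLE(3, 2, 1, 0) == 0b11_10_01_00);

    // Element-wise addition, whether float4 is emulated or a native __vector.
    float4 a = 1.0f;
    float4 b = 2.0f;
    float4 sum = a + b;
    float[4] correct = [3.0f, 3.0f, 3.0f, 3.0f];
    assert(sum.array == correct);
}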