/**
* AVX intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
*
* Copyright: Guillaume Piolat 2022.
*            Johan Engelen 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.avxintrin;

// AVX instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX
// Note: this header will work whether you have AVX enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
// generate AVX instructions.

public import inteli.types;
import inteli.internals;

// Pull in all previous instruction set intrinsics.
public import inteli.tmmintrin;

nothrow @nogc:

/// Extract a 32-bit integer from `a`, selected with `imm8`.
/// Only the three low bits of `imm8` are used to pick the element.
int _mm256_extract_epi32 (__m256i a, const int imm8) pure @trusted
{
    // Reinterpret the 256-bit value as eight 32-bit lanes, then index it
    // with the selector masked into the 0..7 range.
    int8 lanes = cast(int8) a;
    return lanes.array[imm8 & 7];
}
unittest
{
    align(16) int[8] source = [-1, 2, -3, 4, 9, -7, 8, -6];
    __m256i A = _mm256_loadu_si256(cast(__m256i*) source.ptr);

    // Selector bits above the low three are ignored.
    assert(_mm256_extract_epi32(A, 0) == -1);
    assert(_mm256_extract_epi32(A, 8 + 1) == 2);
    assert(_mm256_extract_epi32(A, 16 + 3) == 4);
    assert(_mm256_extract_epi32(A, 32 + 7) == -6);
}

/// Load 256-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
__m256i _mm256_loadu_si256 (const(__m256i)* mem_addr) pure @trusted
{
    // PERF DMD
    pragma(inline, true);
    static if (GDC_with_AVX)
    {
        // GDC with AVX: go through the compiler builtin.
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) mem_addr);
    }
    else version(LDC)
    {
        // LDC: delegate to the `loadUnaligned` helper.
        return loadUnaligned!(__m256i)(cast(long*)mem_addr);
    }
    else
    {
        // Other compilers: copy the four 64-bit lanes one at a time.
        const(long)* src = cast(const(long)*) mem_addr;
        long4 loaded;
        loaded.ptr[0] = src[0];
        loaded.ptr[1] = src[1];
        loaded.ptr[2] = src[2];
        loaded.ptr[3] = src[3];
        return loaded;
    }
}
unittest
{
    align(16) int[8] expected = [-1, 2, -3, 4, 9, -7, 8, -6];
    int8 loaded = cast(int8) _mm256_loadu_si256(cast(__m256i*) expected.ptr);
    assert(loaded.array == expected);
}

/// Broadcast 8-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi8 (byte a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        // Initialise through a named vector instead of a constructor call.
        byte32 splat = a;
        return cast(__m256i) splat;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(byte32(a));
    }
}
unittest
{
    byte32 lanes = cast(byte32) _mm256_set1_epi8(31);
    foreach (i; 0 .. 32)
        assert(lanes.array[i] == 31);
}

/// Broadcast 16-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi16 (short a) pure @trusted
{
    // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    // It used to ICE, now the codegen is just wrong.
    // TODO report this backend issue.
    version(DigitalMars)
    {
        short16 splat = a;
        return cast(__m256i) splat;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(short16(a));
    }
}
unittest
{
    short16 lanes = cast(short16) _mm256_set1_epi16(31);
    foreach (i; 0 .. 16)
        assert(lanes.array[i] == 31);
}

/// Broadcast 32-bit integer `a` to all elements.
__m256i _mm256_set1_epi32 (int a) pure @trusted
{
    // Bad codegen else in DMD.
    // TODO report this backend issue.
    version(DigitalMars)
    {
        int8 splat = a;
        return cast(__m256i) splat;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(int8(a));
    }
}
unittest
{
    int8 lanes = cast(int8) _mm256_set1_epi32(31);
    foreach (i; 0 .. 8)
        assert(lanes.array[i] == 31);
}

/// Set packed 8-bit integers with the supplied values in reverse order.
/// The first argument (`e31`) ends up in element 0 of the result.
__m256i _mm256_setr_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
                          byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
                          byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9,  byte e8,
                          byte e7,  byte e6,  byte e5,  byte e4,  byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    // PERF GDC, not checked
    pragma(inline, true);

    // Gather the arguments into a stack array, in argument order.
    byte[32] elems = [ e31, e30, e29, e28, e27, e26, e25, e24,
                       e23, e22, e21, e20, e19, e18, e17, e16,
                       e15, e14, e13, e12, e11, e10, e9,  e8,
                       e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0 ];

    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) elems.ptr);
    }
    else version(LDC)
    {
        return cast(__m256i)( loadUnaligned!(byte32)(elems.ptr) );
    }
    else
    {
        // Other compilers: copy element by element into the vector.
        byte32 packed;
        foreach (idx; 0 .. 32)
            packed.ptr[idx] = elems[idx];
        return cast(__m256i) packed;
    }
}
unittest
{
    byte[32] expected = [-1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128];
    byte32 packed = cast(byte32) _mm256_setr_epi8( -1, 0, -21, 21, 42, 127, -42, -128,
                                                   -1, 0, -21, 21, 42, 127, -42, -128,
                                                   -1, 0, -21, 21, 42, 127, -42, -128,
                                                   -1, 0, -21, 21, 42, 127, -42, -128);
    assert(packed.array == expected);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8,
                           short e7,  short e6,  short e5,  short e4,  short e3,  short e2,  short e1, short e0) pure @trusted
{
    pragma(inline, true);

    // Gather the arguments into a stack array, in argument order
    // (the first argument, `e15`, lands in element 0).
    short[16] elems = [ e15, e14, e13, e12, e11, e10, e9, e8,
                        e7,  e6,  e5,  e4,  e3,  e2,  e1, e0 ];

    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) elems.ptr);
    }
    else version(LDC)
    {
        return cast(__m256i)( loadUnaligned!(short16)(elems.ptr) );
    }
    else
    {
        // Other compilers: copy element by element into the vector.
        short16 packed;
        foreach (idx; 0 .. 16)
            packed.ptr[idx] = elems[idx];
        return cast(__m256i) packed;
    }
}
unittest
{
    short[16] expected = [-1, 0, -21, 21, 42, 127, -42, -128,
                          -1, 0, -21, 21, 42, 127, -42, -128];
    short16 packed = cast(short16) _mm256_setr_epi16(-1, 0, -21, 21, 42, 127, -42, -128,
                                                     -1, 0, -21, 21, 42, 127, -42, -128);
    assert(packed.array == expected);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
/// The first argument (`e7`) ends up in element 0 of the result.
__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
{
    pragma(inline, true);

    // Gather the arguments into a stack array, in argument order.
    int[8] elems = [e7, e6, e5, e4, e3, e2, e1, e0];

    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) elems.ptr);
    }
    else version(LDC)
    {
        return cast(__m256i)( loadUnaligned!(int8)(elems.ptr) );
    }
    else
    {
        // Other compilers: copy element by element into the vector.
        int8 packed;
        foreach (idx; 0 .. 8)
            packed.ptr[idx] = elems[idx];
        return cast(__m256i) packed;
    }
}
unittest
{
    // Includes both extremes of the 32-bit signed range.
    int[8] expected = [-1, 0, int.min, int.max, 42, 666, -42, -666];
    int8 packed = cast(int8) _mm256_setr_epi32(-1, 0, int.min, int.max, 42, 666, -42, -666);
    assert(packed.array == expected);
}


/// Return vector of type `__m256i` with all elements set to zero.
__m256i _mm256_setzero_si256() pure @trusted
{
    // PERF: nothing was checked
    pragma(inline, true);

    version(LDC)
    {
        // LDC: load eight zeroed 32-bit integers through `loadUnaligned`.
        int[8] result = [0, 0, 0, 0, 0, 0, 0, 0];
        return cast(__m256i)( loadUnaligned!(int8)(result.ptr) );
    }
    else
    {
        // Other compilers: assign the scalar 0 to the whole vector.
        __m256i r;
        r = 0;
        return r;
    }
}
unittest
{
    int8 Z = cast(int8) _mm256_setzero_si256();
    int[8] correct = [0, 0, 0, 0, 0, 0, 0, 0];
    assert(Z.array == correct);
}

/// Store 256-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
// NOTE(review): `mem_addr` is declared `const(__m256i)*` although the function
// writes through it (the const is cast away below). The signature is kept as-is
// so existing callers keep compiling; consider a mutable pointer parameter.
pragma(inline, true)
void _mm256_storeu_si256 (const(__m256i)* mem_addr, __m256i a) pure @trusted
{
    // PERF: DMD and GDC
    version(LDC)
    {
        storeUnaligned!__m256i(a, cast(long*)mem_addr);
    }
    else
    {
        // Other compilers: write the four 64-bit lanes one at a time.
        // Lanes are read through `.array`, the same element-access form
        // used by the other intrinsics in this module.
        long4 v = cast(long4)a;
        long* p = cast(long*)mem_addr;
        for(int n = 0; n < 4; ++n)
            p[n] = v.array[n];
    }
}
unittest
{
    __m256i A = _mm256_setr_epi32(-1, 2, -3, 4, 9, -7, 8, -6);
    int[8] correct = [-1, 2, -3, 4, 9, -7, 8, -6];

    // Store to the start of a buffer.
    int[8] dest;
    _mm256_storeu_si256(cast(__m256i*) dest.ptr, A);
    assert(dest == correct);

    // Store at an offset of one `int`; the element before must stay untouched.
    int[9] shifted;
    _mm256_storeu_si256(cast(__m256i*) &shifted[1], A);
    assert(shifted[0] == 0);
    for (int i = 0; i < 8; ++i)
        assert(shifted[i + 1] == correct[i]);
}