/**
* AVX intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
*
* Copyright: Guillaume Piolat 2022.
*            Johan Engelen 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.avxintrin;

// AVX instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX
// Note: this header will work whether you have AVX enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
// generate AVX instructions.

public import inteli.types;
import inteli.internals;

// Pull in all previous instruction set intrinsics.
public import inteli.tmmintrin;

nothrow @nogc:

/// Extract a 32-bit integer from `a`, selected with `imm8`.
int _mm256_extract_epi32 (__m256i a, const int imm8) pure @trusted
{
    return (cast(int8)a).array[imm8 & 7];
}
unittest
{
    align(16) int[8] data = [-1, 2, -3, 4, 9, -7, 8, -6];
    auto A = _mm256_loadu_si256(cast(__m256i*) data.ptr);
    assert(_mm256_extract_epi32(A, 0) == -1);
    assert(_mm256_extract_epi32(A, 1 + 8) == 2);
    assert(_mm256_extract_epi32(A, 3 + 16) == 4);
    assert(_mm256_extract_epi32(A, 7 + 32) == -6);
}

/// Load 256-bits of integer data from memory. `mem_addr` does not need to be aligned on any particular boundary.
__m256i _mm256_loadu_si256 (const(__m256i)* mem_addr) pure @trusted
{
    // PERF DMD
    pragma(inline, true);
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) mem_addr);
    }
    else version(LDC)
    {
        return loadUnaligned!(__m256i)(cast(long*)mem_addr);
    }
    else
    {
        const(long)* p = cast(const(long)*)mem_addr;
        long4 r;
        r.ptr[0] = p[0];
        r.ptr[1] = p[1];
        r.ptr[2] = p[2];
        r.ptr[3] = p[3];
        return r;
    }
}
unittest
{
    align(16) int[8] correct = [-1, 2, -3, 4, 9, -7, 8, -6];
    int8 A = cast(int8) _mm256_loadu_si256(cast(__m256i*) correct.ptr);
    assert(A.array == correct);
}

/// Broadcast 8-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi8 (byte a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        byte32 v = a;
        return cast(__m256i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(byte32(a));
    }
}
unittest
{
    byte32 a = cast(byte32) _mm256_set1_epi8(31);
    for (int i = 0; i < 32; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 16-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi16 (short a) pure @trusted
{
    version(DigitalMars) // workaround https://issues.dlang.org/show_bug.cgi?id=21469
    {
        short16 v = a;
        return cast(__m256i) v;
    }
    else
    {
        pragma(inline, true);
        return cast(__m256i)(short16(a));
    }
}
unittest
{
    short16 a = cast(short16) _mm256_set1_epi16(31);
    for (int i = 0; i < 16; ++i)
        assert(a.array[i] == 31);
}

/// Broadcast 32-bit integer `a` to all elements of the return value.
__m256i _mm256_set1_epi32 (int a) pure @trusted
{
    pragma(inline, true);
    return cast(__m256i)(int8(a));
}
unittest
{
    int8 a = cast(int8) _mm256_set1_epi32(31);
    for (int i = 0; i < 8; ++i)
        assert(a.array[i] == 31);
}
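
// Illustrative cross-check (a sketch, not part of the original test-suite):
// a value broadcast with _mm256_set1_epi32 can be read back from any lane
// with _mm256_extract_epi32, both defined above in this module.
unittest
{
    __m256i V = _mm256_set1_epi32(42);
    for (int i = 0; i < 8; ++i)
        assert(_mm256_extract_epi32(V, i) == 42);
}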

/// Set packed 8-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi8 (byte e31, byte e30, byte e29, byte e28, byte e27, byte e26, byte e25, byte e24,
                          byte e23, byte e22, byte e21, byte e20, byte e19, byte e18, byte e17, byte e16,
                          byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9,  byte e8,
                          byte e7,  byte e6,  byte e5,  byte e4,  byte e3,  byte e2,  byte e1,  byte e0) pure @trusted
{
    // PERF GDC, not checked
    pragma(inline, true);
    byte[32] result = [ e31, e30, e29, e28, e27, e26, e25, e24,
                        e23, e22, e21, e20, e19, e18, e17, e16,
                        e15, e14, e13, e12, e11, e10, e9,  e8,
                        e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0];
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) result.ptr);
    }
    else version(LDC)
    {
        return cast(__m256i)( loadUnaligned!(byte32)(result.ptr) );
    }
    else
    {
        byte32 r;
        for (int n = 0; n < 32; ++n)
            r.ptr[n] = result[n];
        return cast(__m256i)r;
    }
}
unittest
{
    byte32 A = cast(byte32) _mm256_setr_epi8( -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128,
                                              -1, 0, -21, 21, 42, 127, -42, -128);
    byte[32] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128,
                        -1, 0, -21, 21, 42, 127, -42, -128];
    assert(A.array == correct);
}

/// Set packed 16-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8,
                           short e7,  short e6,  short e5,  short e4,  short e3,  short e2,  short e1, short e0) pure @trusted
{
    pragma(inline, true);
    short[16] result = [ e15, e14, e13, e12, e11, e10, e9, e8,
                         e7,  e6,  e5,  e4,  e3,  e2,  e1, e0];
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) result.ptr);
    }
    else version(LDC)
    {
        return cast(__m256i)( loadUnaligned!(short16)(result.ptr) );
    }
    else
    {
        short16 r;
        for (int n = 0; n < 16; ++n)
            r.ptr[n] = result[n];
        return cast(__m256i)r;
    }
}
unittest
{
    short16 A = cast(short16) _mm256_setr_epi16(-1, 0, -21, 21, 42, 127, -42, -128,
                                                -1, 0, -21, 21, 42, 127, -42, -128);
    short[16] correct = [-1, 0, -21, 21, 42, 127, -42, -128,
                         -1, 0, -21, 21, 42, 127, -42, -128];
    assert(A.array == correct);
}

/// Set packed 32-bit integers with the supplied values in reverse order.
__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) pure @trusted
{
    pragma(inline, true);
    int[8] result = [e7, e6, e5, e4, e3, e2, e1, e0];
    static if (GDC_with_AVX)
    {
        return cast(__m256i) __builtin_ia32_loaddqu256(cast(const(char)*) result.ptr);
    }
    else version(LDC)
    {
        return cast(__m256i)( loadUnaligned!(int8)(result.ptr) );
    }
    else
    {
        int8 r;
        for (int n = 0; n < 8; ++n)
            r.ptr[n] = result[n];
        return cast(__m256i)r;
    }
}
unittest
{
    int8 A = cast(int8) _mm256_setr_epi32(-1, 0, -2147483648, 2147483647, 42, 666, -42, -666);
    int[8] correct = [-1, 0, -2147483648, 2147483647, 42, 666, -42, -666];
    assert(A.array == correct);
}
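
// Illustrative sketch (not from the original test-suite): the "r" in `setr`
// means the arguments are listed from the lowest lane upward, so the first
// argument lands in lane 0 and the last one in lane 7.
unittest
{
    __m256i A = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
    assert(_mm256_extract_epi32(A, 0) == 10);
    assert(_mm256_extract_epi32(A, 7) == 80);
}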

/// Return vector of type `__m256i` with all elements set to zero.
__m256i _mm256_setzero_si256() pure @trusted
{
    // PERF: nothing was checked
    pragma(inline, true);

    version(LDC)
    {
        int[8] result = [0, 0, 0, 0, 0, 0, 0, 0];
        return cast(__m256i)( loadUnaligned!(int8)(result.ptr) );
    }
    else
    {
        __m256i r;
        r = 0;
        return r;
    }
}

/// Store 256-bits of integer data from `a` into memory. `mem_addr` does not need to be aligned on any particular boundary.
pragma(inline, true)
void _mm256_storeu_si256 (__m256i* mem_addr, __m256i a) pure @trusted
{
    // PERF: DMD and GDC
    version(LDC)
    {
        storeUnaligned!__m256i(a, cast(long*)mem_addr);
    }
    else
    {
        // Scalar fallback: write the four 64-bit lanes one by one.
        long4 v = cast(long4)a;
        long* p = cast(long*)mem_addr;
        for (int n = 0; n < 4; ++n)
            p[n] = v.array[n];
    }
}
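
// Illustrative round-trip checks (sketches, not part of the original tests):
// `_mm256_setzero_si256` and `_mm256_storeu_si256` above have no unittest of
// their own in this module, so these only exercise intrinsics defined here.
unittest
{
    // Every 32-bit lane of the zero vector must be zero.
    int8 Z = cast(int8) _mm256_setzero_si256();
    for (int i = 0; i < 8; ++i)
        assert(Z.array[i] == 0);
}
unittest
{
    // Store to a plain stack buffer (no particular alignment required)
    // and compare with the source data.
    int[8] correct = [1, -2, 3, -4, 5, -6, 7, -8];
    __m256i A = _mm256_loadu_si256(cast(const(__m256i)*) correct.ptr);
    int[8] dest;
    _mm256_storeu_si256(cast(__m256i*) dest.ptr, A);
    assert(dest == correct);
}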