/**
* Copyright: Guillaume Piolat 2016-2019.
*            Charles Gregory 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.pmmintrin;

public import inteli.types;
import inteli.internals;
import inteli.emmintrin;


// Note: this header will work whether you have SSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively
// generate SSE3 instructions (they are often enabled with -O1 or greater).


nothrow @nogc:

/// Alternatively add and subtract packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
__m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
{
    // Note: generates addsubpd since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin, because it doesn't generate addsubpd
    a.ptr[0] = a.array[0] - b.array[0];
    a.ptr[1] = a.array[1] + b.array[1];
    return a;
}
unittest
{
    auto v1 = _mm_setr_pd(1.0, 2.0);
    auto v2 = _mm_setr_pd(1.0, 2.0);
    assert(_mm_addsub_pd(v1, v2).array == _mm_setr_pd(0.0, 4.0).array);
}

/// Alternatively add and subtract packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
{
    // Note: generates addsubps since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin
    a.ptr[0] -= b.array[0];
    a.ptr[1] += b.array[1];
    a.ptr[2] -= b.array[2];
    a.ptr[3] += b.array[3];
    return a;
}
unittest
{
    auto v1 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    auto v2 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert( _mm_addsub_ps(v1, v2).array == _mm_setr_ps(0.0f, 4.0f, 0.0f, 8.0f).array );
}

version(LDC)
{
    /// Horizontally add adjacent pairs of double-precision (64-bit)
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hadd_pd (__m128d a, __m128d b) pure @safe
    {
        static if (__traits(targetHasFeature, "sse3"))
        {
            return __builtin_ia32_haddpd(a, b);
        }
        else
        {
            __m128d res;
            res[0] = a[1] + a[0];
            res[1] = b[1] + b[0];
            return res;
        }
    }
}
else
{
    /// Horizontally add adjacent pairs of double-precision (64-bit)
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
    {
        // On GDC this generates haddpd with -O1
        __m128d res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = b.array[1] + b.array[0];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert( _mm_hadd_pd(A, B).array == _mm_setr_pd(3.5, 3.0).array );
}

version(LDC)
{
    /// Horizontally add adjacent pairs of single-precision (32-bit)
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hadd_ps (__m128 a, __m128 b) pure @safe
    {
        static if (__traits(targetHasFeature, "sse3"))
        {
            return __builtin_ia32_haddps(a, b);
        }
        else
        {
            __m128 res;
            res[0] = a[1] + a[0];
            res[1] = a[3] + a[2];
            res[2] = b[1] + b[0];
            res[3] = b[3] + b[2];
            return res;
        }
    }
}
else
{
    // PERF: for GDC, detect SSE3 and use the relevant builtin

    /// Horizontally add adjacent pairs of single-precision (32-bit)
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
    {
        __m128 res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert( _mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array );
}

version(LDC)
{
    /// Horizontally subtract adjacent pairs of double-precision (64-bit)
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hsub_pd (__m128d a, __m128d b) pure @safe
    {
        static if (__traits(targetHasFeature, "sse3"))
        {
            return __builtin_ia32_hsubpd(a, b);
        }
        else
        {
            __m128d res;
            res[0] = a[0] - a[1];
            res[1] = b[0] - b[1];
            return res;
        }
    }
}
else
{
    /// Horizontally subtract adjacent pairs of double-precision (64-bit)
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
    {
        // On GDC this generates hsubpd with -O1
        __m128d res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = b.array[0] - b.array[1];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert( _mm_hsub_pd(A, B).array == _mm_setr_pd(-0.5, -1.0).array );
}

version(LDC)
{
    /// Horizontally subtract adjacent pairs of single-precision (32-bit)
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hsub_ps (__m128 a, __m128 b) pure @safe
    {
        static if (__traits(targetHasFeature, "sse3"))
        {
            return __builtin_ia32_hsubps(a, b);
        }
        else
        {
            __m128 res;
            res[0] = a[0] - a[1];
            res[1] = a[2] - a[3];
            res[2] = b[0] - b[1];
            res[3] = b[2] - b[3];
            return res;
        }
    }
}
else
{
    /// Horizontally subtract adjacent pairs of single-precision (32-bit)
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
    {
        // PERF: GDC doesn't generate the right instruction, do something about it
        __m128 res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = a.array[2] - a.array[3];
        res.ptr[2] = b.array[0] - b.array[1];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array);
}

/// Load 128 bits of integer data from unaligned memory.
// Note: LDDQU is said to have only ever been useful around 2008.
// See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
alias _mm_lddqu_si128 = _mm_loadu_si128;
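
// A minimal usage sketch of the alias above, following the unittest convention used
// elsewhere in this module. It assumes only _mm_loadu_si128 from inteli.emmintrin,
// which accepts any alignment; the source address is deliberately offset to be unaligned.
unittest
{
    int[5] storage = [42, 1, 2, 3, 4];
    // &storage[1] is generally not 16-byte aligned; the load must still succeed.
    __m128i v = _mm_lddqu_si128(cast(const(__m128i)*) &storage[1]);
    int[4] correct = [1, 2, 3, 4];
    assert(v.array == correct);
}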

/// Load a double-precision (64-bit) floating-point element from memory into both elements of the result.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Same for GDC with -O1
    double value = *mem_addr;
    __m128d res;
    res.ptr[0] = value;
    res.ptr[1] = value;
    return res;
}
unittest
{
    version(LDC)
    {
        double a = 7.5;
        assert(_mm_loaddup_pd(&a) == _mm_set_pd(7.5, 7.5));
    }
    else
    {
        double a = 7.5;
        // For some reason, this line breaks with LDC, but not when isolated!
        // Not reported upstream yet.
        assert(_mm_loaddup_pd(&a).array == _mm_set_pd(7.5, 7.5).array);
    }
}

/// Duplicate the low double-precision (64-bit) floating-point element from `a` into both elements of the result.
__m128d _mm_movedup_pd (__m128d a) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // GDC generates something efficient with -O1
    a.ptr[1] = a.array[0];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(7.0, 2.5);
    assert(_mm_movedup_pd(A).array == _mm_set_pd(7.0, 7.0).array);
}

/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_movehdup_ps (__m128 a) pure @trusted
{
    // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF: but GDC never generates it
    a.ptr[0] = a.array[1];
    a.ptr[2] = a.array[3];
    return a;
}

/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_moveldup_ps (__m128 a) pure @trusted
{
    // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF: but GDC never generates it
    a.ptr[1] = a.array[0];
    a.ptr[3] = a.array[2];
    return a;
}
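
// The two unittests below are illustrative sketches for _mm_movehdup_ps and
// _mm_moveldup_ps, which lack coverage above; they follow the _mm_setr_ps / .array
// comparison convention used by the other unittests in this module.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    // movshdup semantics: result = [a1, a1, a3, a3]
    assert(_mm_movehdup_ps(A).array == _mm_setr_ps(2.0f, 2.0f, 4.0f, 4.0f).array);
}

unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    // movsldup semantics: result = [a0, a0, a2, a2]
    assert(_mm_moveldup_ps(A).array == _mm_setr_ps(1.0f, 1.0f, 3.0f, 3.0f).array);
}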