/**
* SSE3 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE3
*
* Copyright: Guillaume Piolat 2016-2020.
*            Charles Gregory 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.pmmintrin;

public import inteli.types;
import inteli.internals;
public import inteli.emmintrin;


// Note: this header will work whether you have SSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively
// generate SSE3 instructions (they are often enabled with -O1 or greater).
// With GDC, use "dflags-gdc": ["-msse3"] or equivalent to generate SSE3 instructions.


nothrow @nogc:

/// Alternatively add and subtract packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
__m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE3)
    {
        return __builtin_ia32_addsubpd(a, b);
    }
    else static if (LDC_with_SSE3)
    {
        return __builtin_ia32_addsubpd(a, b);
    }
    else
    {
        // ARM: well optimized starting with LDC 1.18.0 -O2, not disrupted by LLVM 13+
        a.ptr[0] = a.array[0] - b.array[0];
        a.ptr[1] = a.array[1] + b.array[1];
        return a;
    }
}
unittest
{
    auto v1 = _mm_setr_pd(1.0, 2.0);
    auto v2 = _mm_setr_pd(1.0, 2.0);
    assert(_mm_addsub_pd(v1, v2).array == _mm_setr_pd(0.0, 4.0).array);
}

/// Alternatively add and subtract packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE3)
    {
        return __builtin_ia32_addsubps(a, b);
    }
    else static if (LDC_with_SSE3)
    {
        return __builtin_ia32_addsubps(a, b);
    }
    else
    {
        a.ptr[0] -= b.array[0];
        a.ptr[1] += b.array[1];
        a.ptr[2] -= b.array[2];
        a.ptr[3] += b.array[3];
        return a;
    }
}
unittest
{
    auto v1 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    auto v2 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert( _mm_addsub_ps(v1, v2).array == _mm_setr_ps(0.0f, 4.0f, 0.0f, 8.0f).array );
}


/// Horizontally add adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        return __builtin_ia32_haddpd(a, b);
    }
    else
    {
        // On GDC this generates haddpd with -O1
        __m128d res; // PERF =void;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = b.array[1] + b.array[0];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert( _mm_hadd_pd(A, B).array == _mm_setr_pd(3.5, 3.0).array );
}
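
// Usage sketch: both lanes of _mm_hadd_pd(v, v) hold v[0] + v[1], so it can
// serve as a two-lane horizontal sum. Assumes _mm_cvtsd_f64 from
// inteli.emmintrin (publicly imported above) to extract the low lane.
unittest
{
    __m128d v = _mm_setr_pd(3.25, 4.75);
    double sum = _mm_cvtsd_f64(_mm_hadd_pd(v, v)); // both lanes contain 8.0
    assert(sum == 8.0);
}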

// PERF: for GDC, detect SSE3 and use the relevant builtin
/// Horizontally add adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
__m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        return __builtin_ia32_haddps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        return vpaddq_f32(a, b);
    }
    else
    {
        __m128 res; // PERF =void;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert( _mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array );
}

/// Horizontally subtract adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        return __builtin_ia32_hsubpd(a, b);
    }
    else
    {
        // On GDC this generates hsubpd with -O1
        __m128d res; // PERF =void;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = b.array[0] - b.array[1];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert( _mm_hsub_pd(A, B).array == _mm_setr_pd(-0.5, -1.0).array );
}

/// Horizontally subtract adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
__m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        return __builtin_ia32_hsubps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // Flip the sign bit of odd lanes, then pairwise-add:
        // a[0] + (-a[1]) == a[0] - a[1]
        int4 mask = [0, 0x80000000, 0, 0x80000000];
        a = cast(__m128)(cast(int4)a ^ mask);
        b = cast(__m128)(cast(int4)b ^ mask);
        return vpaddq_f32(a, b);
    }
    else
    {
        // PERF: GDC doesn't generate the right instruction, do something
        __m128 res; // PERF =void;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = a.array[2] - a.array[3];
        res.ptr[2] = b.array[0] - b.array[1];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array);
}

/// Load 128-bits of integer data from unaligned memory.
// Note: LDDQU is said to have only ever been useful around 2008.
// See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
alias _mm_lddqu_si128 = _mm_loadu_si128;

/// Load a double-precision (64-bit) floating-point element from memory into both elements of result.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Same for GDC with -O1
    double value = *mem_addr;
    __m128d res; // PERF =void;
    res.ptr[0] = value;
    res.ptr[1] = value;
    return res;
}
unittest
{
    version(LDC)
    {
        double a = 7.5;
        assert(_mm_loaddup_pd(&a) == _mm_set_pd(7.5, 7.5));
    }
    else
    {
        double a = 7.5;
        // For some reason, this line used to break with LDC, but not when isolated! Was never reported.
        assert(_mm_loaddup_pd(&a).array == _mm_set_pd(7.5, 7.5).array);
    }
}
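
// Usage sketch: broadcasting a scalar with _mm_loaddup_pd pairs naturally with
// lane-wise arithmetic such as _mm_mul_pd (from inteli.emmintrin, publicly
// imported above) to scale every lane by the same factor.
unittest
{
    double gain = 0.5;
    __m128d v = _mm_setr_pd(2.0, 8.0);
    __m128d scaled = _mm_mul_pd(v, _mm_loaddup_pd(&gain));
    assert(scaled.array == _mm_setr_pd(1.0, 4.0).array);
}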

/// Duplicate the low double-precision (64-bit) floating-point element from `a`.
__m128d _mm_movedup_pd (__m128d a) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Something efficient with -O1 for GDC
    a.ptr[1] = a.array[0];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(7.0, 2.5);
    assert(_mm_movedup_pd(A).array == _mm_set_pd(7.0, 7.0).array);
}

/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_movehdup_ps (__m128 a) pure @trusted
{
    // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF but GDC never generates it
    a.ptr[0] = a.array[1];
    a.ptr[2] = a.array[3];
    return a;
}
unittest
{
    __m128 A = _mm_movehdup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] correct = [2.0f, 2, 4, 4];
    assert(A.array == correct);
}

/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_moveldup_ps (__m128 a) pure @trusted
{
    // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF but GDC never generates it
    a.ptr[1] = a.array[0];
    a.ptr[3] = a.array[2];
    return a;
}
unittest
{
    __m128 A = _mm_moveldup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] correct = [1.0f, 1, 3, 3];
    assert(A.array == correct);
}
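
// Usage sketch: _mm_moveldup_ps, _mm_movehdup_ps and _mm_addsub_ps together
// implement packed complex multiplication, one of the motivating use cases for
// SSE3. Assumes _mm_mul_ps and _mm_setr_ps, reachable through the public
// imports above. The real/imaginary swap of `b` is written with element reads
// for clarity; a tuned version would use a shuffle instead.
unittest
{
    // Two complex numbers per vector, laid out as [re0, im0, re1, im1].
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); // (1+2i), (3+4i)
    __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); // (5+6i), (7+8i)

    __m128 re = _mm_moveldup_ps(a); // [a0.re, a0.re, a1.re, a1.re]
    __m128 im = _mm_movehdup_ps(a); // [a0.im, a0.im, a1.im, a1.im]

    // b with real and imaginary parts swapped within each pair.
    __m128 bSwap = _mm_setr_ps(b.array[1], b.array[0], b.array[3], b.array[2]);

    // Even lanes: re*b.re - im*b.im, odd lanes: re*b.im + im*b.re.
    __m128 r = _mm_addsub_ps(_mm_mul_ps(re, b), _mm_mul_ps(im, bSwap));

    // (1+2i)*(5+6i) = -7+16i and (3+4i)*(7+8i) = -11+52i
    float[4] correct = [-7.0f, 16.0f, -11.0f, 52.0f];
    assert(r.array == correct);
}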