/**
* SSE3 intrinsics.
*
* Copyright: Guillaume Piolat 2016-2019.
*            Charles Gregory 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.pmmintrin;

public import inteli.types;
import inteli.internals;
import inteli.emmintrin;


// Note: this header will work whether you have SSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively
// generate SSE3 instructions (they are often enabled with -O1 or greater).


nothrow @nogc:

/// Alternatively add and subtract packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
__m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
{
    // Note: generates addsubpd since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin, because it doesn't generate addsubpd
    // ARM: well optimized starting with LDC 1.18.0 -O2
    a.ptr[0] = a.array[0] - b.array[0];
    a.ptr[1] = a.array[1] + b.array[1];
    return a;
}
unittest
{
    auto v1 = _mm_setr_pd(1.0, 2.0);
    auto v2 = _mm_setr_pd(1.0, 2.0);
    assert(_mm_addsub_pd(v1, v2).array == _mm_setr_pd(0.0, 4.0).array);
}

/// Alternatively add and subtract packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
{
    // Note: generates addsubps since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin
    a.ptr[0] -= b.array[0];
    a.ptr[1] += b.array[1];
    a.ptr[2] -= b.array[2];
    a.ptr[3] += b.array[3];
    return a;
}
unittest
{
    auto v1 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    auto v2 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert(_mm_addsub_ps(v1, v2).array == _mm_setr_ps(0.0f, 4.0f, 0.0f, 8.0f).array);
}

static if (LDC_with_SSE3)
{
    /// Horizontally add adjacent pairs of double-precision (64-bit)
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hadd_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_haddpd(a, b);
    }
}
else
{
    /// Horizontally add adjacent pairs of double-precision (64-bit)
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
    {
        // On GDC this generates haddpd with -O1
        __m128d res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = b.array[1] + b.array[0];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hadd_pd(A, B).array == _mm_setr_pd(3.5, 3.0).array);
}

static if (LDC_with_SSE3)
{
    /// Horizontally add adjacent pairs of single-precision (32-bit)
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hadd_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_haddps(a, b);
    }
}
else static if (LDC_with_ARM64)
{
    /// Horizontally add adjacent pairs of single-precision (32-bit)
    /// floating-point elements in `a` and `b`.
    float4 _mm_hadd_ps (float4 a, float4 b) pure @safe
    {
        return vpaddq_f32(a, b);
    }
}
else
{
    // PERF: for GDC, detect SSE3 and use the relevant builtin

    /// Horizontally add adjacent pairs of single-precision (32-bit)
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
    {
        __m128 res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array);
}

static if (LDC_with_SSE3)
{
    /// Horizontally subtract adjacent pairs of double-precision (64-bit)
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hsub_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_hsubpd(a, b);
    }
}
else
{
    /// Horizontally subtract adjacent pairs of double-precision (64-bit)
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
    {
        // On GDC this generates hsubpd with -O1
        __m128d res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = b.array[0] - b.array[1];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hsub_pd(A, B).array == _mm_setr_pd(-0.5, -1.0).array);
}

static if (LDC_with_SSE3)
{
    /// Horizontally subtract adjacent pairs of single-precision (32-bit)
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hsub_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_hsubps(a, b);
    }
}
else static if (LDC_with_ARM64)
{
    /// Horizontally subtract adjacent pairs of single-precision (32-bit)
    /// floating-point elements in `a` and `b`.
    float4 _mm_hsub_ps (float4 a, float4 b) pure @safe
    {
        int4 mask = [0, 0x80000000, 0, 0x80000000];
        a = cast(__m128)(cast(int4)a ^ mask);
        b = cast(__m128)(cast(int4)b ^ mask);
        return vpaddq_f32(a, b);
    }
}
else
{
    /// Horizontally subtract adjacent pairs of single-precision (32-bit)
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
    {
        // PERF: GDC doesn't generate the right instruction, do something
        __m128 res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = a.array[2] - a.array[3];
        res.ptr[2] = b.array[0] - b.array[1];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array);
}

/// Load 128-bits of integer data from unaligned memory.
// Note: LDDQU is said to have only ever been useful around 2008.
// See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
alias _mm_lddqu_si128 = _mm_loadu_si128;

/// Load a double-precision (64-bit) floating-point element from memory
/// into both elements of the result.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Same for GDC with -O1
    double value = *mem_addr;
    __m128d res;
    res.ptr[0] = value;
    res.ptr[1] = value;
    return res;
}
unittest
{
    version(LDC)
    {
        double a = 7.5;
        assert(_mm_loaddup_pd(&a) == _mm_set_pd(7.5, 7.5));
    }
    else
    {
        double a = 7.5;
        // For some reason, this line breaks with LDC, but not when isolated!
        // This was not reported yet.
        assert(_mm_loaddup_pd(&a).array == _mm_set_pd(7.5, 7.5).array);
    }
}

/// Duplicate the low double-precision (64-bit) floating-point element from `a`.
__m128d _mm_movedup_pd (__m128d a) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Something efficient with -O1 for GDC
    a.ptr[1] = a.array[0];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(7.0, 2.5);
    assert(_mm_movedup_pd(A).array == _mm_set_pd(7.0, 7.0).array);
}

/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_movehdup_ps (__m128 a) pure @trusted
{
    // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF: but GDC never generates it
    a.ptr[0] = a.array[1];
    a.ptr[2] = a.array[3];
    return a;
}

/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_moveldup_ps (__m128 a) pure @trusted
{
    // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF: but GDC never generates it
    a.ptr[1] = a.array[0];
    a.ptr[3] = a.array[2];
    return a;
}
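// A minimal unittest sketch for _mm_movehdup_ps, which had no coverage;
// it checks the [a1, a1, a3, a3] duplication pattern implied by the body above.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert(_mm_movehdup_ps(A).array == _mm_setr_ps(2.0f, 2.0f, 4.0f, 4.0f).array);
}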
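// Likewise, a minimal unittest sketch for _mm_moveldup_ps, which had no coverage;
// it checks the [a0, a0, a2, a2] duplication pattern implied by the body above.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert(_mm_moveldup_ps(A).array == _mm_setr_ps(1.0f, 1.0f, 3.0f, 3.0f).array);
}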