/**
* SSE3 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE3
*
* Copyright: Guillaume Piolat 2016-2020.
*            Charles Gregory 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.pmmintrin;

public import inteli.types;
import inteli.internals;
public import inteli.emmintrin;


// Note: this header will work whether you have SSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively
// generate SSE3 instructions (they are often enabled with -O1 or greater).


nothrow @nogc:

/// Alternatively add and subtract packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
__m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
{
    // Note: generates addsubpd since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin, because it doesn't generate addsubpd
    // ARM: well optimized starting with LDC 1.18.0 -O2
    a.ptr[0] = a.array[0] - b.array[0];
    a.ptr[1] = a.array[1] + b.array[1];
    return a;
}
unittest
{
    auto v1 = _mm_setr_pd(1.0, 2.0);
    auto v2 = _mm_setr_pd(1.0, 2.0);
    assert(_mm_addsub_pd(v1, v2).array == _mm_setr_pd(0.0, 4.0).array);
}

/// Alternatively add and subtract packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
{
    // Note: generates addsubps since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin
    a.ptr[0] -= b.array[0];
    a.ptr[1] += b.array[1];
    a.ptr[2] -= b.array[2];
    a.ptr[3] += b.array[3];
    return a;
}
unittest
{
    auto v1 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    auto v2 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert(_mm_addsub_ps(v1, v2).array == _mm_setr_ps(0.0f, 4.0f, 0.0f, 8.0f).array);
}


/// Horizontally add adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        return __builtin_ia32_haddpd(a, b);
    }
    else
    {
        // On GDC this generates haddpd with -O1
        __m128d res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = b.array[1] + b.array[0];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hadd_pd(A, B).array == _mm_setr_pd(3.5, 3.0).array);
}

// PERF: for GDC, detect SSE3 and use the relevant builtin
/// Horizontally add adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
__m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        return __builtin_ia32_haddps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        return vpaddq_f32(a, b);
    }
    else
    {
        __m128 res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array);
}
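// Added illustrative sketch (not part of the original module): horizontal
// adds are typically chained to reduce a whole vector to a single sum in
// two steps. Uses only intrinsics defined or publicly imported above.
unittest
{
    __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 s = _mm_hadd_ps(v, v); // [3, 7, 3, 7]
    s = _mm_hadd_ps(s, s);        // [10, 10, 10, 10]
    assert(s.array[0] == 10.0f);
}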
/// Horizontally subtract adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        return __builtin_ia32_hsubpd(a, b);
    }
    else
    {
        // On GDC this generates hsubpd with -O1
        __m128d res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = b.array[0] - b.array[1];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hsub_pd(A, B).array == _mm_setr_pd(-0.5, -1.0).array);
}

/// Horizontally subtract adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
__m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        return __builtin_ia32_hsubps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // Negate the odd lanes, then a pairwise add yields the horizontal subtraction.
        int4 mask = [0, 0x80000000, 0, 0x80000000];
        a = cast(__m128)(cast(int4)a ^ mask);
        b = cast(__m128)(cast(int4)b ^ mask);
        return vpaddq_f32(a, b);
    }
    else
    {
        // PERF: GDC doesn't generate hsubps here; detect SSE3 and use the relevant builtin
        __m128 res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = a.array[2] - a.array[3];
        res.ptr[2] = b.array[0] - b.array[1];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array);
}

/// Load 128 bits of integer data from unaligned memory.
// Note: LDDQU is said to have only ever been useful around 2008.
// See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
alias _mm_lddqu_si128 = _mm_loadu_si128;
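// Added usage sketch (the alias above has no test of its own): being an
// unaligned load, it accepts any address, aligned or not.
unittest
{
    byte[17] buf;
    foreach (i; 0 .. 17)
        buf[i] = cast(byte)i;
    // Load 16 bytes starting at a deliberately odd (misaligned) address.
    __m128i v = _mm_lddqu_si128(cast(const(__m128i)*)(buf.ptr + 1));
    byte16 bv = cast(byte16) v;
    assert(bv.array[0] == 1 && bv.array[15] == 16);
}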
/// Load a double-precision (64-bit) floating-point element from memory into both elements of the result.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Same for GDC with -O1
    double value = *mem_addr;
    __m128d res;
    res.ptr[0] = value;
    res.ptr[1] = value;
    return res;
}
unittest
{
    version(LDC)
    {
        double a = 7.5;
        assert(_mm_loaddup_pd(&a) == _mm_set_pd(7.5, 7.5));
    }
    else
    {
        double a = 7.5;
        // For some reason, this line used to break with LDC, but not when isolated! Was never reported.
        assert(_mm_loaddup_pd(&a).array == _mm_set_pd(7.5, 7.5).array);
    }
}

/// Duplicate the low double-precision (64-bit) floating-point element from `a`.
__m128d _mm_movedup_pd (__m128d a) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // GDC emits something efficient with -O1
    a.ptr[1] = a.array[0];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(7.0, 2.5);
    assert(_mm_movedup_pd(A).array == _mm_set_pd(7.0, 7.0).array);
}

/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_movehdup_ps (__m128 a) pure @trusted
{
    // Note: generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF: GDC never generates it
    a.ptr[0] = a.array[1];
    a.ptr[2] = a.array[3];
    return a;
}
unittest
{
    __m128 A = _mm_movehdup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] correct = [2.0f, 2, 4, 4];
    assert(A.array == correct);
}

/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_moveldup_ps (__m128 a) pure @trusted
{
    // Note: generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF: GDC never generates it
    a.ptr[1] = a.array[0];
    a.ptr[3] = a.array[2];
    return a;
}
unittest
{
    __m128 A = _mm_moveldup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] correct = [1.0f, 1, 3, 3];
    assert(A.array == correct);
}
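// Added illustrative sketch (not part of the original module): the classic
// use case that motivates moveldup/movehdup/addsub together is multiplying
// packed complex numbers stored as [re0, im0, re1, im1]. The re/im swap is
// written with _mm_setr_ps and .array here to stay within calls already
// used in this file; real code would use a shuffle instead.
unittest
{
    __m128 x = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); // 1+2i, 3+4i
    __m128 y = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); // 5+6i, 7+8i
    __m128 t0 = _mm_mul_ps(_mm_moveldup_ps(x), y);  // [xr*yr, xr*yi, ...]
    __m128 yswap = _mm_setr_ps(y.array[1], y.array[0], y.array[3], y.array[2]);
    __m128 t1 = _mm_mul_ps(_mm_movehdup_ps(x), yswap); // [xi*yi, xi*yr, ...]
    __m128 r = _mm_addsub_ps(t0, t1); // [xr*yr - xi*yi, xr*yi + xi*yr, ...]
    // (1+2i)*(5+6i) = -7+16i and (3+4i)*(7+8i) = -11+52i
    float[4] correct = [-7.0f, 16.0f, -11.0f, 52.0f];
    assert(r.array == correct);
}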