/**
* SSE3 intrinsics.
*
* Copyright: Guillaume Piolat 2016-2020.
*            Charles Gregory 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.pmmintrin;

public import inteli.types;
import inteli.internals;
public import inteli.emmintrin;


// Note: this header will work whether you have SSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively
// generate SSE3 instructions (they are often enabled with -O1 or greater).


nothrow @nogc:

/// Alternatively add and subtract packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
__m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
{
    // Note: generates addsubpd since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin, because it doesn't generate addsubpd
    // ARM: well optimized starting with LDC 1.18.0 -O2
    a.ptr[0] = a.array[0] - b.array[0];
    a.ptr[1] = a.array[1] + b.array[1];
    return a;
}
unittest
{
    auto v1 = _mm_setr_pd(1.0, 2.0);
    auto v2 = _mm_setr_pd(1.0, 2.0);
    assert(_mm_addsub_pd(v1, v2).array == _mm_setr_pd(0.0, 4.0).array);
}

/// Alternatively add and subtract packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
{
    // Note: generates addsubps since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin
    a.ptr[0] -= b.array[0];
    a.ptr[1] += b.array[1];
    a.ptr[2] -= b.array[2];
    a.ptr[3] += b.array[3];
    return a;
}
unittest
{
    auto v1 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    auto v2 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert(_mm_addsub_ps(v1, v2).array == _mm_setr_ps(0.0f, 4.0f, 0.0f, 8.0f).array);
}


/// Horizontally add adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        return __builtin_ia32_haddpd(a, b);
    }
    else
    {
        // On GDC this generates haddpd with -O1
        __m128d res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = b.array[1] + b.array[0];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hadd_pd(A, B).array == _mm_setr_pd(3.5, 3.0).array);
}

/// Horizontally add adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
__m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        return __builtin_ia32_haddps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        return vpaddq_f32(a, b);
    }
    else
    {
        // PERF: for GDC, detect SSE3 and use the relevant builtin
        __m128 res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array);
}
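
// A classic use of _mm_hadd_ps: two back-to-back horizontal adds reduce a
// 4-lane vector to its total sum, broadcast to every lane. A minimal sketch:
unittest
{
    __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 s = _mm_hadd_ps(v, v); // (3, 7, 3, 7)
    s = _mm_hadd_ps(s, s);        // (10, 10, 10, 10)
    assert(s.array[0] == 10.0f);
}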
/// Horizontally subtract adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        return __builtin_ia32_hsubpd(a, b);
    }
    else
    {
        // On GDC this generates hsubpd with -O1
        __m128d res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = b.array[0] - b.array[1];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hsub_pd(A, B).array == _mm_setr_pd(-0.5, -1.0).array);
}

/// Horizontally subtract adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
__m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        return __builtin_ia32_hsubps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        int4 mask = [0, 0x80000000, 0, 0x80000000];
        a = cast(__m128)(cast(int4)a ^ mask);
        b = cast(__m128)(cast(int4)b ^ mask);
        return vpaddq_f32(a, b);
    }
    else
    {
        // PERF: GDC doesn't generate hsubps here, do something about it
        __m128 res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = a.array[2] - a.array[3];
        res.ptr[2] = b.array[0] - b.array[1];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array);
}

/// Load 128-bits of integer data from unaligned memory.
// Note: LDDQU is said to have only ever been useful around 2008, hence this
// is simply an alias of the regular unaligned load.
// See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
alias _mm_lddqu_si128 = _mm_loadu_si128;

/// Load a double-precision (64-bit) floating-point element from memory into both elements of result.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Same for GDC with -O1
    double value = *mem_addr;
    __m128d res;
    res.ptr[0] = value;
    res.ptr[1] = value;
    return res;
}
unittest
{
    version(LDC)
    {
        double a = 7.5;
        assert(_mm_loaddup_pd(&a) == _mm_set_pd(7.5, 7.5));
    }
    else
    {
        double a = 7.5;
        // For some reason, this line used to break with LDC, but not when isolated! It was never reported.
        assert(_mm_loaddup_pd(&a).array == _mm_set_pd(7.5, 7.5).array);
    }
}

/// Duplicate the low double-precision (64-bit) floating-point element from `a`.
__m128d _mm_movedup_pd (__m128d a) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // GDC generates something efficient with -O1
    a.ptr[1] = a.array[0];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(7.0, 2.5);
    assert(_mm_movedup_pd(A).array == _mm_set_pd(7.0, 7.0).array);
}

/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_movehdup_ps (__m128 a) pure @trusted
{
    // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF: but GDC never generates it
    a.ptr[0] = a.array[1];
    a.ptr[2] = a.array[3];
    return a;
}
unittest
{
    __m128 A = _mm_movehdup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] correct = [2.0f, 2, 4, 4];
    assert(A.array == correct);
}
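
// Together, _mm_addsub_pd and _mm_movedup_pd above cover the double-precision
// complex multiply that motivated SSE3. A minimal sketch; the lane swap uses
// plain element access rather than a shuffle, to keep the example self-contained:
unittest
{
    __m128d z1 = _mm_setr_pd(1.0, 2.0); // 1 + 2i
    __m128d z2 = _mm_setr_pd(3.0, 4.0); // 3 + 4i
    __m128d t1 = _mm_mul_pd(z1, _mm_movedup_pd(z2));       // (a*c, b*c)
    __m128d sw = _mm_setr_pd(z1.array[1], z1.array[0]);    // z1 swapped: (b, a)
    __m128d t2 = _mm_mul_pd(sw, _mm_set1_pd(z2.array[1])); // (b*d, a*d)
    __m128d r  = _mm_addsub_pd(t1, t2); // (a*c - b*d, b*c + a*d) = -5 + 10i
    double[2] correct = [-5.0, 10.0];
    assert(r.array == correct);
}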
/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_moveldup_ps (__m128 a) pure @trusted
{
    // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF: but GDC never generates it
    a.ptr[1] = a.array[0];
    a.ptr[3] = a.array[2];
    return a;
}
unittest
{
    __m128 A = _mm_moveldup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] correct = [1.0f, 1, 3, 3];
    assert(A.array == correct);
}
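
// The canonical SSE3 pattern combines _mm_moveldup_ps, _mm_movehdup_ps and
// _mm_addsub_ps to multiply complex numbers packed as (re, im, re, im).
// A minimal sketch; the re/im swap of `b` uses plain element access instead
// of a shuffle, to keep the example self-contained:
unittest
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f,  0.5f, 1.0f);  // 1+2i,  0.5+1i
    __m128 b = _mm_setr_ps(3.0f, 4.0f, -1.0f, 2.0f);  // 3+4i, -1+2i
    __m128 are = _mm_moveldup_ps(a);                  // real parts of a, duplicated
    __m128 aim = _mm_movehdup_ps(a);                  // imaginary parts of a, duplicated
    __m128 bsw = _mm_setr_ps(b.array[1], b.array[0],
                             b.array[3], b.array[2]); // b with re/im swapped
    __m128 r = _mm_addsub_ps(_mm_mul_ps(are, b), _mm_mul_ps(aim, bsw));
    float[4] correct = [-5.0f, 10.0f, -2.5f, 0.0f];   // -5+10i, -2.5+0i
    assert(r.array == correct);
}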