/**
* Copyright: Guillaume Piolat 2016-2019.
*            Charles Gregory 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module inteli.pmmintrin;

public import inteli.types;
import inteli.internals;
import inteli.emmintrin;


// Note: this header will work whether you have SSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively
// generate SSE3 instructions (they are often enabled with -O1 or greater).


nothrow @nogc:

/// Alternatively add and subtract packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
__m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
{
    // Note: generates addsubpd since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin, because it doesn't generate addsubpd
    // ARM: well optimized starting with LDC 1.18.0 -O2
    a.ptr[0] = a.array[0] - b.array[0];
    a.ptr[1] = a.array[1] + b.array[1];
    return a;
}
unittest
{
    auto v1 = _mm_setr_pd(1.0, 2.0);
    auto v2 = _mm_setr_pd(1.0, 2.0);
    assert(_mm_addsub_pd(v1, v2).array == _mm_setr_pd(0.0, 4.0).array);
}

/// Alternatively add and subtract packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
{
    // Note: generates addsubps since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin
    a.ptr[0] -= b.array[0];
    a.ptr[1] += b.array[1];
    a.ptr[2] -= b.array[2];
    a.ptr[3] += b.array[3];
    return a;
}
unittest
{
    auto v1 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    auto v2 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert(_mm_addsub_ps(v1, v2).array == _mm_setr_ps(0.0f, 4.0f, 0.0f, 8.0f).array);
}

static if (LDC_with_SSE3)
{
    /// Horizontally add adjacent pairs of double-precision (64-bit)
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hadd_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_haddpd(a, b);
    }
}
else
{
    /// Horizontally add adjacent pairs of double-precision (64-bit)
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
    {
        // On GDC this generates haddpd with -O1
        __m128d res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = b.array[1] + b.array[0];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hadd_pd(A, B).array == _mm_setr_pd(3.5, 3.0).array);
}

static if (LDC_with_SSE3)
{
    /// Horizontally add adjacent pairs of single-precision (32-bit)
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hadd_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_haddps(a, b);
    }
}
else
{
    // PERF: for GDC, detect SSE3 and use the relevant builtin

    /// Horizontally add adjacent pairs of single-precision (32-bit)
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
    {
        // TODO: quite bad on #ARM
        __m128 res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array);
}
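
// Usage sketch: two successive horizontal adds reduce a float4 to the sum of
// its four lanes, broadcast into every lane. This is a compact (though not
// necessarily the fastest) way to compute a horizontal sum.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 H = _mm_hadd_ps(A, A); // = (3, 7, 3, 7)
    __m128 S = _mm_hadd_ps(H, H); // = (10, 10, 10, 10)
    assert(S.array == _mm_setr_ps(10.0f, 10.0f, 10.0f, 10.0f).array);
}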

static if (LDC_with_SSE3)
{
    /// Horizontally subtract adjacent pairs of double-precision (64-bit)
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hsub_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_hsubpd(a, b);
    }
}
else
{
    /// Horizontally subtract adjacent pairs of double-precision (64-bit)
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
    {
        // On GDC this generates hsubpd with -O1
        __m128d res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = b.array[0] - b.array[1];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hsub_pd(A, B).array == _mm_setr_pd(-0.5, -1.0).array);
}

static if (LDC_with_SSE3)
{
    /// Horizontally subtract adjacent pairs of single-precision (32-bit)
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hsub_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_hsubps(a, b);
    }
}
else
{
    /// Horizontally subtract adjacent pairs of single-precision (32-bit)
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
    {
        // PERF: GDC doesn't generate the right instruction, do something
        // TODO: quite bad on #ARM
        __m128 res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = a.array[2] - a.array[3];
        res.ptr[2] = b.array[0] - b.array[1];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array);
}

/// Load 128-bits of integer data from unaligned memory.
// Note: the saying is that LDDQU was only ever useful around 2008.
// See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
alias _mm_lddqu_si128 = _mm_loadu_si128;
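
// Minimal check that _mm_lddqu_si128 performs a plain unaligned 128-bit
// integer load, just like _mm_loadu_si128.
unittest
{
    int[8] buf = [1, 2, 3, 4, 5, 6, 7, 8];
    // &buf[1] is, in general, only 4-byte aligned.
    __m128i A = _mm_lddqu_si128(cast(const(__m128i)*) &buf[1]);
    int[4] correct = [2, 3, 4, 5];
    assert(A.array == correct);
}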

/// Load a double-precision (64-bit) floating-point element from memory into both elements of result.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    //       Same for GDC with -O1
    double value = *mem_addr;
    __m128d res;
    res.ptr[0] = value;
    res.ptr[1] = value;
    return res;
}
unittest
{
    version(LDC)
    {
        double a = 7.5;
        assert(_mm_loaddup_pd(&a) == _mm_set_pd(7.5, 7.5));
    }
    else
    {
        double a = 7.5;
        // For some reason, this line breaks with LDC, but not when isolated.
        // Not reported yet.
        assert(_mm_loaddup_pd(&a).array == _mm_set_pd(7.5, 7.5).array);
    }
}

/// Duplicate the low double-precision (64-bit) floating-point element from `a`.
__m128d _mm_movedup_pd (__m128d a) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    //       Something efficient with -O1 for GDC
    a.ptr[1] = a.array[0];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(7.0, 2.5);
    assert(_mm_movedup_pd(A).array == _mm_set_pd(7.0, 7.0).array);
}

/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_movehdup_ps (__m128 a) pure @trusted
{
    // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF: but GDC never generates it
    a.ptr[0] = a.array[1];
    a.ptr[2] = a.array[3];
    return a;
}

/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_moveldup_ps (__m128 a) pure @trusted
{
    // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF: but GDC never generates it
    a.ptr[1] = a.array[0];
    a.ptr[3] = a.array[2];
    return a;
}
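
// Minimal checks for _mm_movehdup_ps and _mm_moveldup_ps: odd-indexed
// (respectively even-indexed) lanes are duplicated into their neighbours.
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert(_mm_movehdup_ps(A).array == _mm_setr_ps(2.0f, 2.0f, 4.0f, 4.0f).array);
    assert(_mm_moveldup_ps(A).array == _mm_setr_ps(1.0f, 1.0f, 3.0f, 3.0f).array);
}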