1 /** 2 * Transcendental function on 4 numbers at once. 3 * 4 * Copyright: Copyright Auburn Sounds 2016-2018. 5 * Copyright (C) 2007 Julien Pommier 6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 7 * Authors: Guillaume Piolat 8 */ 9 module inteli.math; 10 11 /* Copyright (C) 2007 Julien Pommier 12 13 This software is provided 'as-is', without any express or implied 14 warranty. In no event will the authors be held liable for any damages 15 arising from the use of this software. 16 17 Permission is granted to anyone to use this software for any purpose, 18 including commercial applications, and to alter it and redistribute it 19 freely, subject to the following restrictions: 20 21 1. The origin of this software must not be misrepresented; you must not 22 claim that you wrote the original software. If you use this software 23 in a product, an acknowledgment in the product documentation would be 24 appreciated but is not required. 25 2. Altered source versions must be plainly marked as such, and must not be 26 misrepresented as being the original software. 27 3. This notice may not be removed or altered from any source distribution. 28 29 (this is the zlib license) 30 */ 31 import inteli.emmintrin; 32 import inteli.internals; 33 34 nothrow @nogc: 35 36 /// Natural `log` computed for a single 32-bit float. 37 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50 38 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 39 // #BONUS 40 float _mm_log_ss(float v) pure @safe 41 { 42 __m128 r = _mm_log_ps(_mm_set1_ps(v)); 43 return r[0]; 44 } 45 46 /// Natural logarithm computed for 4 simultaneous float. 47 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50 48 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 49 // #BONUS 50 __m128 _mm_log_ps(__m128 x) pure @safe 51 { 52 static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000]; 53 static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524]; 54 static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2]; 55 static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1]; 56 static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1]; 57 static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1]; 58 static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1]; 59 static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1]; 60 static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1]; 61 static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1]; 62 static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1]; 63 static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4]; 64 static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375]; 65 66 /* the smallest non denormalized float number */ 67 static immutable __m128i _psi_min_norm_pos = [0x00800000, 0x00800000, 0x00800000, 0x00800000]; 68 69 __m128i emm0; 70 __m128 one = _ps_1; 71 __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); 72 x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos); /* cut off denormalized stuff */ 73 emm0 = _mm_srli_epi32(cast(__m128i)x, 23); 74 75 /* keep only the fractional part */ 76 x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask); 77 x = _mm_or_ps(x, _ps_0p5); 78 79 emm0 -= _pi32_0x7f; 80 __m128 e = _mm_cvtepi32_ps(emm0); 81 e += one; 82 __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF); 83 __m128 tmp = _mm_and_ps(x, mask); 84 x -= one; 85 e -= _mm_and_ps(one, mask); 86 x += tmp; 87 __m128 z = x * x; 88 __m128 y = _ps_cephes_log_p0; 89 y *= x; 90 y += _ps_cephes_log_p1; 91 y *= x; 92 y += _ps_cephes_log_p2; 93 y *= x; 94 y += _ps_cephes_log_p3; 95 y *= x; 96 y += _ps_cephes_log_p4; 97 y *= x; 98 y += _ps_cephes_log_p5; 99 y *= x; 100 y += _ps_cephes_log_p6; 101 y *= x; 102 y += _ps_cephes_log_p7; 103 y *= x; 104 y += _ps_cephes_log_p8; 105 y *= x; 106 y *= z; 107 tmp = e * _ps_cephes_log_q1; 108 y += tmp; 109 tmp = z * _ps_0p5; 110 y = y - tmp; 111 tmp = e * _ps_cephes_log_q2; 112 x += y; 113 x += tmp; 114 x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN 115 return x; 116 } 117 118 /// Natural `exp` computed for a single float. 119 /// This is an approximation, valid up to approximately -109dB of accuracy 120 /// IMPORTANT: NaN input not supported. 121 // #BONUS 122 float _mm_exp_ss(float v) pure @safe 123 { 124 __m128 r = _mm_exp_ps(_mm_set1_ps(v)); 125 return r[0]; 126 } 127 128 /// Natural `exp` computed for 4 simultaneous float in `x`. 129 /// This is an approximation, valid up to approximately -109dB of accuracy 130 /// IMPORTANT: NaN input not supported. 131 // #BONUS 132 __m128 _mm_exp_ps(__m128 x) pure @safe 133 { 134 static immutable __m128 _ps_exp_hi = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f]; 135 static immutable __m128 _ps_exp_lo = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f]; 136 static immutable __m128 _ps_cephes_LOG2EF = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341]; 137 static immutable __m128 _ps_cephes_exp_C1 = [0.693359375, 0.693359375, 0.693359375, 0.693359375]; 138 static immutable __m128 _ps_cephes_exp_C2 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4]; 139 static immutable __m128 _ps_cephes_exp_p0 = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4]; 140 static immutable __m128 _ps_cephes_exp_p1 = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3]; 141 static immutable __m128 _ps_cephes_exp_p2 = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3]; 142 static immutable __m128 _ps_cephes_exp_p3 = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2]; 143 static immutable __m128 _ps_cephes_exp_p4 = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1]; 144 static immutable __m128 _ps_cephes_exp_p5 = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1]; 145 146 __m128 tmp = _mm_setzero_ps(), fx; 147 __m128i emm0; 148 __m128 one = _ps_1; 149 150 x = _mm_min_ps(x, _ps_exp_hi); 151 x = _mm_max_ps(x, _ps_exp_lo); 152 153 /* express exp(x) as exp(g + n*log(2)) */ 154 fx = x * _ps_cephes_LOG2EF; 155 fx += _ps_0p5; 156 157 /* how to perform a floorf with SSE: just below */ 158 emm0 = _mm_cvttps_epi32(fx); 159 tmp = _mm_cvtepi32_ps(emm0); 160 161 /* if greater, substract 1 */ 162 __m128 mask = _mm_cmpgt_ps(tmp, fx); 163 mask = _mm_and_ps(mask, one); 164 fx = tmp - mask; 165 166 tmp = fx * _ps_cephes_exp_C1; 167 __m128 z = fx * _ps_cephes_exp_C2; 168 x -= tmp; 169 x -= z; 170 171 z = x * x; 172 173 __m128 y = _ps_cephes_exp_p0; 174 y *= x; 175 y += _ps_cephes_exp_p1; 176 y *= x; 177 y += _ps_cephes_exp_p2; 178 y *= x; 179 y += _ps_cephes_exp_p3; 180 y *= x; 181 y += _ps_cephes_exp_p4; 182 y *= x; 183 y += _ps_cephes_exp_p5; 184 y *= z; 185 y += x; 186 y += one; 187 188 /* build 2^n */ 189 emm0 = _mm_cvttps_epi32(fx); 190 emm0 += _pi32_0x7f; 191 emm0 = _mm_slli_epi32(emm0, 23); 192 __m128 pow2n = cast(__m128)emm0; 193 y *= pow2n; 194 return y; 195 } 196 197 /// Computes `base^exponent` for a single 32-bit float. 198 /// This is an approximation, valid up to approximately -100dB of accuracy 199 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 200 // #BONUS 201 float _mm_pow_ss(float base, float exponent) pure @safe 202 { 203 __m128 r = _mm_pow_ps(_mm_set1_ps(base), _mm_set1_ps(exponent)); 204 return r[0]; 205 } 206 207 /// Computes `base^exponent`, for 4 floats at once. 208 /// This is an approximation, valid up to approximately -100dB of accuracy 209 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 210 // #BONUS 211 __m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe 212 { 213 return _mm_exp_ps(exponents * _mm_log_ps(base)); 214 } 215 216 /// Computes `base^exponent`, for 4 floats at once. 217 /// This is an approximation, valid up to approximately -100dB of accuracy 218 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 219 // #BONUS 220 __m128 _mm_pow_ps(__m128 base, float exponent) pure @safe 221 { 222 return _mm_exp_ps(_mm_set1_ps(exponent) * _mm_log_ps(base)); 223 } 224 225 unittest 226 { 227 import std.math; 228 229 bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow 230 { 231 if (!isFinite(groundTruth)) 232 return true; // no need to approximate where this is NaN or infinite 233 234 if (groundTruth == 0) // the approximaton should produce zero too if needed 235 { 236 return approx == 0; 237 } 238 239 if (approx == 0) 240 { 241 // If the approximation produces zero, the error should be below 140 dB 242 return ( abs(groundTruth) < 1e-7 ); 243 } 244 245 if ( ( abs(groundTruth / approx) - 1 ) >= epsilon) 246 { 247 import core.stdc.stdio; 248 debug printf("approxEquals (%g, %g, %g) failed\n", groundTruth, approx, epsilon); 249 debug printf("ratio is %f\n", abs(groundTruth / approx) - 1); 250 } 251 252 return ( abs(groundTruth / approx) - 1 ) < epsilon; 253 } 254 255 // test _mm_log_ps 256 for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.05) 257 { 258 foreach (exponent; -23..23) 259 { 260 double x = mantissa * 2.0 ^^ exponent; 261 double phobosValue = log(x); 262 __m128 v = _mm_log_ps(_mm_set1_ps(x)); 263 foreach(i; 0..4) 264 assert(approxEquals(phobosValue, v[i], 1.1e-6)); 265 } 266 } 267 268 // test _mm_exp_ps 269 for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1) 270 { 271 foreach (exponent; -23..23) 272 { 273 double x = mantissa * 2.0 ^^ exponent; 274 275 // don't test too high numbers because they saturate FP precision pretty fast 276 if (x > 50) continue; 277 278 double phobosValue = exp(x); 279 __m128 v = _mm_exp_ps(_mm_set1_ps(x)); 280 foreach(i; 0..4) 281 assert(approxEquals(phobosValue, v[i], 3.4e-6)); 282 } 283 } 284 285 // test than exp(-inf) is 0 286 { 287 __m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity)); 288 float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f]; 289 assert(R.array == correct); 290 } 291 292 // test log baheviour with NaN and infinities 293 // the only guarantee for now is that _mm_log_ps(negative) yield a NaN 294 { 295 __m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan)); 296 // DOESN'T PASS 297 // assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity 298 // DOESN'T PASS 299 // assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity 300 assert(isNaN(R[2])); // log(negative number) = NaN 301 302 // DOESN'T PASS 303 //assert(isNaN(R[3])); // log(NaN) = NaN 304 } 305 306 307 // test _mm_pow_ps 308 for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1) 309 { 310 foreach (exponent; -8..4) 311 { 312 double powExponent = mantissa * 2.0 ^^ exponent; 313 314 for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.1) 315 { 316 foreach (exponent2; -4..4) 317 { 318 double powBase = mantissa2 * 2.0 ^^ exponent2; 319 double phobosValue = pow(powBase, powExponent); 320 float fPhobos = phobosValue; 321 if (!isFinite(fPhobos)) continue; 322 __m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent)); 323 324 foreach(i; 0..4) 325 { 326 if (!approxEquals(phobosValue, v[i], 1e-5)) 327 { 328 printf("%g ^^ %g\n", powBase, powExponent); 329 assert(false); 330 } 331 } 332 } 333 } 334 } 335 } 336 } 337 338 private: 339 340 static immutable __m128 _ps_1 = [1.0f, 1.0f, 1.0f, 1.0f]; 341 static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f]; 342 static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f];