1 /** 2 * Transcendental bonus functions. 3 * 4 * Copyright: Copyright Guillaumr Piolat 2016-2020. 5 * Copyright (C) 2007 Julien Pommier 6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 7 */ 8 module inteli.math; 9 10 /* Copyright (C) 2007 Julien Pommier 11 12 This software is provided 'as-is', without any express or implied 13 warranty. In no event will the authors be held liable for any damages 14 arising from the use of this software. 15 16 Permission is granted to anyone to use this software for any purpose, 17 including commercial applications, and to alter it and redistribute it 18 freely, subject to the following restrictions: 19 20 1. The origin of this software must not be misrepresented; you must not 21 claim that you wrote the original software. If you use this software 22 in a product, an acknowledgment in the product documentation would be 23 appreciated but is not required. 24 2. Altered source versions must be plainly marked as such, and must not be 25 misrepresented as being the original software. 26 3. This notice may not be removed or altered from any source distribution. 27 28 (this is the zlib license) 29 */ 30 import inteli.emmintrin; 31 import inteli.internals; 32 33 nothrow @nogc: 34 35 /// Natural `log` computed for a single 32-bit float. 36 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50 37 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 38 // #BONUS 39 float _mm_log_ss(float v) pure @safe 40 { 41 __m128 r = _mm_log_ps(_mm_set1_ps(v)); 42 return r.array[0]; 43 } 44 45 /// Natural logarithm computed for 4 simultaneous float. 46 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50 47 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 48 // #BONUS 49 __m128 _mm_log_ps(__m128 x) pure @safe 50 { 51 static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000]; 52 static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524]; 53 static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2]; 54 static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1]; 55 static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1]; 56 static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1]; 57 static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1]; 58 static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1]; 59 static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1]; 60 static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1]; 61 static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1]; 62 static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4]; 63 static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375]; 64 65 /* the smallest non denormalized float number */ 66 static immutable __m128i _psi_min_norm_pos = [0x00800000, 0x00800000, 0x00800000, 0x00800000]; 67 68 __m128i emm0; 69 __m128 one = _ps_1; 70 __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); 71 x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos); /* cut off denormalized stuff */ 72 emm0 = _mm_srli_epi32(cast(__m128i)x, 23); 73 74 /* keep only the fractional part */ 75 x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask); 76 x = _mm_or_ps(x, _ps_0p5); 77 78 emm0 = _mm_sub_epi32(emm0, _pi32_0x7f); 79 __m128 e = _mm_cvtepi32_ps(emm0); 80 e += one; 81 __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF); 82 __m128 tmp = _mm_and_ps(x, mask); 83 x -= one; 84 e -= _mm_and_ps(one, mask); 85 x += tmp; 86 __m128 z = x * x; 87 __m128 y = _ps_cephes_log_p0; 88 y = _mm_mul_ps(y, x); 89 y = _mm_add_ps(y, _ps_cephes_log_p1); 90 y = _mm_mul_ps(y, x); 91 y = _mm_add_ps(y, _ps_cephes_log_p2); 92 y = _mm_mul_ps(y, x); 93 y = _mm_add_ps(y, _ps_cephes_log_p3); 94 y = _mm_mul_ps(y, x); 95 y = _mm_add_ps(y, _ps_cephes_log_p4); 96 y = _mm_mul_ps(y, x); 97 y = _mm_add_ps(y, _ps_cephes_log_p5); 98 y = _mm_mul_ps(y, x); 99 y = _mm_add_ps(y, _ps_cephes_log_p6); 100 y = _mm_mul_ps(y, x); 101 y = _mm_add_ps(y, _ps_cephes_log_p7); 102 y = _mm_mul_ps(y, x); 103 y = _mm_add_ps(y, _ps_cephes_log_p8); 104 y = _mm_mul_ps(y, x); 105 y = _mm_mul_ps(y, z); 106 tmp = _mm_mul_ps(e, _ps_cephes_log_q1); 107 y = _mm_add_ps(y, tmp); 108 tmp = _mm_mul_ps(z, _ps_0p5); 109 y = _mm_sub_ps(y, tmp); 110 tmp = _mm_mul_ps(e, _ps_cephes_log_q2); 111 x = _mm_add_ps(x, y); 112 x = _mm_add_ps(x, tmp); 113 x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN 114 return x; 115 } 116 117 /// Natural `exp` computed for a single float. 118 /// This is an approximation, valid up to approximately -109dB of accuracy 119 /// IMPORTANT: NaN input not supported. 120 // #BONUS 121 float _mm_exp_ss(float v) pure @safe 122 { 123 __m128 r = _mm_exp_ps(_mm_set1_ps(v)); 124 return r.array[0]; 125 } 126 127 /// Natural `exp` computed for 4 simultaneous float in `x`. 128 /// This is an approximation, valid up to approximately -109dB of accuracy 129 /// IMPORTANT: NaN input not supported. 130 // #BONUS 131 __m128 _mm_exp_ps(__m128 x) pure @safe 132 { 133 static immutable __m128 _ps_exp_hi = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f]; 134 static immutable __m128 _ps_exp_lo = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f]; 135 static immutable __m128 _ps_cephes_LOG2EF = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341]; 136 static immutable __m128 _ps_cephes_exp_C1 = [0.693359375, 0.693359375, 0.693359375, 0.693359375]; 137 static immutable __m128 _ps_cephes_exp_C2 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4]; 138 static immutable __m128 _ps_cephes_exp_p0 = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4]; 139 static immutable __m128 _ps_cephes_exp_p1 = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3]; 140 static immutable __m128 _ps_cephes_exp_p2 = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3]; 141 static immutable __m128 _ps_cephes_exp_p3 = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2]; 142 static immutable __m128 _ps_cephes_exp_p4 = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1]; 143 static immutable __m128 _ps_cephes_exp_p5 = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1]; 144 145 __m128 tmp = _mm_setzero_ps(), fx; 146 __m128i emm0; 147 __m128 one = _ps_1; 148 149 x = _mm_min_ps(x, _ps_exp_hi); 150 x = _mm_max_ps(x, _ps_exp_lo); 151 152 /* express exp(x) as exp(g + n*log(2)) */ 153 fx = _mm_mul_ps(x, _ps_cephes_LOG2EF); 154 fx = _mm_add_ps(fx, _ps_0p5); 155 156 /* how to perform a floorf with SSE: just below */ 157 emm0 = _mm_cvttps_epi32(fx); 158 tmp = _mm_cvtepi32_ps(emm0); 159 160 /* if greater, substract 1 */ 161 __m128 mask = _mm_cmpgt_ps(tmp, fx); 162 mask = _mm_and_ps(mask, one); 163 fx = tmp - mask; 164 165 tmp = _mm_mul_ps(fx, _ps_cephes_exp_C1); 166 __m128 z = _mm_mul_ps(fx, _ps_cephes_exp_C2); 167 x -= tmp; 168 x -= z; 169 170 z = x * x; 171 172 __m128 y = _ps_cephes_exp_p0; 173 y = _mm_mul_ps(y, x); 174 y = _mm_add_ps(y, _ps_cephes_exp_p1); 175 y = _mm_mul_ps(y, x); 176 y = _mm_add_ps(y, _ps_cephes_exp_p2); 177 y = _mm_mul_ps(y, x); 178 y = _mm_add_ps(y, _ps_cephes_exp_p3); 179 y = _mm_mul_ps(y, x); 180 y = _mm_add_ps(y, _ps_cephes_exp_p4); 181 y = _mm_mul_ps(y, x); 182 y = _mm_add_ps(y, _ps_cephes_exp_p5); 183 y = _mm_mul_ps(y, z); 184 y = _mm_add_ps(y, x); 185 y += one; 186 187 /* build 2^n */ 188 emm0 = _mm_cvttps_epi32(fx); 189 190 emm0 = _mm_add_epi32(emm0, _pi32_0x7f); 191 emm0 = _mm_slli_epi32(emm0, 23); 192 __m128 pow2n = cast(__m128)emm0; 193 y *= pow2n; 194 return y; 195 } 196 197 /// Computes `base^exponent` for a single 32-bit float. 198 /// This is an approximation, valid up to approximately -100dB of accuracy 199 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 200 // #BONUS 201 float _mm_pow_ss(float base, float exponent) pure @safe 202 { 203 __m128 r = _mm_pow_ps(_mm_set1_ps(base), _mm_set1_ps(exponent)); 204 return r.array[0]; 205 } 206 207 /// Computes `base^exponent`, for 4 floats at once. 208 /// This is an approximation, valid up to approximately -100dB of accuracy 209 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 210 // #BONUS 211 __m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe 212 { 213 return _mm_exp_ps(exponents * _mm_log_ps(base)); 214 } 215 216 /// Computes `base^exponent`, for 4 floats at once. 217 /// This is an approximation, valid up to approximately -100dB of accuracy 218 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 219 // #BONUS 220 __m128 _mm_pow_ps(__m128 base, float exponent) pure @safe 221 { 222 return _mm_exp_ps(_mm_set1_ps(exponent) * _mm_log_ps(base)); 223 } 224 225 unittest 226 { 227 import std.math; 228 229 bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow 230 { 231 if (!isFinite(groundTruth)) 232 return true; // no need to approximate where this is NaN or infinite 233 234 if (groundTruth == 0) // the approximaton should produce zero too if needed 235 { 236 return approx == 0; 237 } 238 239 if (approx == 0) 240 { 241 // If the approximation produces zero, the error should be below 140 dB 242 return ( abs(groundTruth) < 1e-7 ); 243 } 244 245 if ( ( abs(groundTruth / approx) - 1 ) >= epsilon) 246 { 247 import core.stdc.stdio; 248 debug printf("approxEquals (%g, %g, %g) failed\n", groundTruth, approx, epsilon); 249 debug printf("ratio is %f\n", abs(groundTruth / approx) - 1); 250 } 251 252 return ( abs(groundTruth / approx) - 1 ) < epsilon; 253 } 254 255 // test _mm_log_ps 256 for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.05) 257 { 258 foreach (exponent; -23..23) 259 { 260 double x = mantissa * 2.0 ^^ exponent; 261 double phobosValue = log(x); 262 __m128 v = _mm_log_ps(_mm_set1_ps(x)); 263 foreach(i; 0..4) 264 assert(approxEquals(phobosValue, v.array[i], 1.1e-6)); 265 } 266 } 267 268 // test _mm_exp_ps 269 for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1) 270 { 271 foreach (exponent; -23..23) 272 { 273 double x = mantissa * 2.0 ^^ exponent; 274 275 // don't test too high numbers because they saturate FP precision pretty fast 276 if (x > 50) continue; 277 278 double phobosValue = exp(x); 279 __m128 v = _mm_exp_ps(_mm_set1_ps(x)); 280 foreach(i; 0..4) 281 { 282 if (!approxEquals(phobosValue, v.array[i], 3.4e-6)) 283 { 284 import core.stdc.stdio; 285 printf("x = %f truth = %f vs estimate = %fn", x, phobosValue, v.array[i]); 286 assert(false); 287 } 288 } 289 } 290 } 291 292 // test than exp(-inf) is 0 293 { 294 __m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity)); 295 float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f]; 296 assert(R.array == correct); 297 } 298 299 // test log baheviour with NaN and infinities 300 // the only guarantee for now is that _mm_log_ps(negative) yield a NaN 301 { 302 __m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan)); 303 // DOESN'T PASS 304 // assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity 305 // DOESN'T PASS 306 // assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity 307 assert(isNaN(R.array[2])); // log(negative number) = NaN 308 309 // DOESN'T PASS 310 //assert(isNaN(R[3])); // log(NaN) = NaN 311 } 312 313 314 // test _mm_pow_ps 315 for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1) 316 { 317 foreach (exponent; -8..4) 318 { 319 double powExponent = mantissa * 2.0 ^^ exponent; 320 321 for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.1) 322 { 323 foreach (exponent2; -4..4) 324 { 325 double powBase = mantissa2 * 2.0 ^^ exponent2; 326 double phobosValue = pow(powBase, powExponent); 327 float fPhobos = phobosValue; 328 if (!isFinite(fPhobos)) continue; 329 __m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent)); 330 331 foreach(i; 0..4) 332 { 333 if (!approxEquals(phobosValue, v.array[i], 1e-5)) 334 { 335 printf("%g ^^ %g\n", powBase, powExponent); 336 assert(false); 337 } 338 } 339 } 340 } 341 } 342 } 343 } 344 345 private: 346 347 static immutable __m128 _ps_1 = [1.0f, 1.0f, 1.0f, 1.0f]; 348 static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f]; 349 static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f];