1 /** 2 * Transcendental bonus functions. 3 * 4 * Copyright: Copyright Guillaumr Piolat 2016-2020. 5 * Copyright (C) 2007 Julien Pommier 6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 7 */ 8 module inteli.math; 9 10 /* Copyright (C) 2007 Julien Pommier 11 12 This software is provided 'as-is', without any express or implied 13 warranty. In no event will the authors be held liable for any damages 14 arising from the use of this software. 15 16 Permission is granted to anyone to use this software for any purpose, 17 including commercial applications, and to alter it and redistribute it 18 freely, subject to the following restrictions: 19 20 1. The origin of this software must not be misrepresented; you must not 21 claim that you wrote the original software. If you use this software 22 in a product, an acknowledgment in the product documentation would be 23 appreciated but is not required. 24 2. Altered source versions must be plainly marked as such, and must not be 25 misrepresented as being the original software. 26 3. This notice may not be removed or altered from any source distribution. 27 28 (this is the zlib license) 29 */ 30 import inteli.emmintrin; 31 import inteli.internals; 32 33 nothrow @nogc: 34 35 /// Natural `log` computed for a single 32-bit float. 36 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50 37 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 38 // #BONUS 39 float _mm_log_ss(float v) pure @safe 40 { 41 __m128 r = _mm_log_ps(_mm_set1_ps(v)); 42 return r.array[0]; 43 } 44 45 /// Natural logarithm computed for 4 simultaneous float. 46 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50 47 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 48 // #BONUS 49 __m128 _mm_log_ps(__m128 x) pure @safe 50 { 51 static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000]; 52 static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524]; 53 static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2]; 54 static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1]; 55 static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1]; 56 static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1]; 57 static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1]; 58 static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1]; 59 static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1]; 60 static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1]; 61 static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1]; 62 static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4]; 63 static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375]; 64 65 /* the smallest non denormalized float number */ 66 static immutable __m128i _psi_min_norm_pos = [0x00800000, 0x00800000, 0x00800000, 0x00800000]; 67 68 __m128i emm0; 69 __m128 one = _ps_1; 70 __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); 71 x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos); /* cut off denormalized stuff */ 72 emm0 = _mm_srli_epi32(cast(__m128i)x, 23); 73 74 /* keep only the fractional part */ 75 x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask); 76 x = _mm_or_ps(x, _ps_0p5); 77 78 emm0 = _mm_sub_epi32(emm0, _pi32_0x7f); 79 __m128 e = _mm_cvtepi32_ps(emm0); 80 e += one; 81 __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF); 82 __m128 tmp = _mm_and_ps(x, mask); 83 x -= one; 84 e -= _mm_and_ps(one, mask); 85 x += tmp; 86 __m128 z = x * x; 87 __m128 y = _ps_cephes_log_p0; 88 y *= x; 89 y += _ps_cephes_log_p1; 90 y *= x; 91 y += _ps_cephes_log_p2; 92 y *= x; 93 y += _ps_cephes_log_p3; 94 y *= x; 95 y += _ps_cephes_log_p4; 96 y *= x; 97 y += _ps_cephes_log_p5; 98 y *= x; 99 y += _ps_cephes_log_p6; 100 y *= x; 101 y += _ps_cephes_log_p7; 102 y *= x; 103 y += _ps_cephes_log_p8; 104 y *= x; 105 106 y = y * z; 107 tmp = e * _ps_cephes_log_q1; 108 y += tmp; 109 tmp = z * _ps_0p5; 110 y = y - tmp; 111 tmp = e * _ps_cephes_log_q2; 112 x += y; 113 x += tmp; 114 x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN 115 return x; 116 } 117 118 /// Natural `exp` computed for a single float. 119 /// This is an approximation, valid up to approximately -109dB of accuracy 120 /// IMPORTANT: NaN input not supported. 121 // #BONUS 122 float _mm_exp_ss(float v) pure @safe 123 { 124 __m128 r = _mm_exp_ps(_mm_set1_ps(v)); 125 return r.array[0]; 126 } 127 128 /// Natural `exp` computed for 4 simultaneous float in `x`. 129 /// This is an approximation, valid up to approximately -109dB of accuracy 130 /// IMPORTANT: NaN input not supported. 131 // #BONUS 132 __m128 _mm_exp_ps(__m128 x) pure @safe 133 { 134 static immutable __m128 _ps_exp_hi = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f]; 135 static immutable __m128 _ps_exp_lo = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f]; 136 static immutable __m128 _ps_cephes_LOG2EF = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341]; 137 static immutable __m128 _ps_cephes_exp_C1 = [0.693359375, 0.693359375, 0.693359375, 0.693359375]; 138 static immutable __m128 _ps_cephes_exp_C2 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4]; 139 static immutable __m128 _ps_cephes_exp_p0 = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4]; 140 static immutable __m128 _ps_cephes_exp_p1 = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3]; 141 static immutable __m128 _ps_cephes_exp_p2 = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3]; 142 static immutable __m128 _ps_cephes_exp_p3 = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2]; 143 static immutable __m128 _ps_cephes_exp_p4 = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1]; 144 static immutable __m128 _ps_cephes_exp_p5 = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1]; 145 146 __m128 tmp = _mm_setzero_ps(), fx; 147 __m128i emm0; 148 __m128 one = _ps_1; 149 150 x = _mm_min_ps(x, _ps_exp_hi); 151 x = _mm_max_ps(x, _ps_exp_lo); 152 153 /* express exp(x) as exp(g + n*log(2)) */ 154 fx = x * _ps_cephes_LOG2EF; 155 fx += _ps_0p5; 156 157 /* how to perform a floorf with SSE: just below */ 158 emm0 = _mm_cvttps_epi32(fx); 159 tmp = _mm_cvtepi32_ps(emm0); 160 161 /* if greater, substract 1 */ 162 __m128 mask = _mm_cmpgt_ps(tmp, fx); 163 mask = _mm_and_ps(mask, one); 164 fx = tmp - mask; 165 166 tmp = fx * _ps_cephes_exp_C1; 167 __m128 z = fx * _ps_cephes_exp_C2; 168 x -= tmp; 169 x -= z; 170 171 z = x * x; 172 173 __m128 y = _ps_cephes_exp_p0; 174 y *= x; 175 y += _ps_cephes_exp_p1; 176 y *= x; 177 y += _ps_cephes_exp_p2; 178 y *= x; 179 y += _ps_cephes_exp_p3; 180 y *= x; 181 y += _ps_cephes_exp_p4; 182 y *= x; 183 y += _ps_cephes_exp_p5; 184 y *= z; 185 y += x; 186 y += one; 187 188 /* build 2^n */ 189 emm0 = _mm_cvttps_epi32(fx); 190 191 emm0 = _mm_add_epi32(emm0, _pi32_0x7f); 192 emm0 = _mm_slli_epi32(emm0, 23); 193 __m128 pow2n = cast(__m128)emm0; 194 y *= pow2n; 195 return y; 196 } 197 198 /// Computes `base^exponent` for a single 32-bit float. 199 /// This is an approximation, valid up to approximately -100dB of accuracy 200 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 201 // #BONUS 202 float _mm_pow_ss(float base, float exponent) pure @safe 203 { 204 __m128 r = _mm_pow_ps(_mm_set1_ps(base), _mm_set1_ps(exponent)); 205 return r.array[0]; 206 } 207 208 /// Computes `base^exponent`, for 4 floats at once. 209 /// This is an approximation, valid up to approximately -100dB of accuracy 210 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 211 // #BONUS 212 __m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe 213 { 214 return _mm_exp_ps(exponents * _mm_log_ps(base)); 215 } 216 217 /// Computes `base^exponent`, for 4 floats at once. 218 /// This is an approximation, valid up to approximately -100dB of accuracy 219 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 220 // #BONUS 221 __m128 _mm_pow_ps(__m128 base, float exponent) pure @safe 222 { 223 return _mm_exp_ps(_mm_set1_ps(exponent) * _mm_log_ps(base)); 224 } 225 226 unittest 227 { 228 import std.math; 229 230 bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow 231 { 232 if (!isFinite(groundTruth)) 233 return true; // no need to approximate where this is NaN or infinite 234 235 if (groundTruth == 0) // the approximaton should produce zero too if needed 236 { 237 return approx == 0; 238 } 239 240 if (approx == 0) 241 { 242 // If the approximation produces zero, the error should be below 140 dB 243 return ( abs(groundTruth) < 1e-7 ); 244 } 245 246 if ( ( abs(groundTruth / approx) - 1 ) >= epsilon) 247 { 248 assert(false); 249 } 250 251 return ( abs(groundTruth / approx) - 1 ) < epsilon; 252 } 253 254 // test _mm_log_ps 255 for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.05) 256 { 257 foreach (exponent; -23..23) 258 { 259 double x = mantissa * 2.0 ^^ exponent; 260 double phobosValue = log(x); 261 __m128 v = _mm_log_ps(_mm_set1_ps(x)); 262 foreach(i; 0..4) 263 assert(approxEquals(phobosValue, v.array[i], 1.1e-6)); 264 } 265 } 266 267 // test _mm_exp_ps 268 for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1) 269 { 270 foreach (exponent; -23..23) 271 { 272 double x = mantissa * 2.0 ^^ exponent; 273 274 // don't test too high numbers because they saturate FP precision pretty fast 275 if (x > 50) continue; 276 277 double phobosValue = exp(x); 278 __m128 v = _mm_exp_ps(_mm_set1_ps(x)); 279 foreach(i; 0..4) 280 { 281 if (!approxEquals(phobosValue, v.array[i], 3.4e-6)) 282 { 283 assert(false); 284 } 285 } 286 } 287 } 288 289 // test than exp(-inf) is 0 290 { 291 __m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity)); 292 float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f]; 293 assert(R.array == correct); 294 } 295 296 // test log baheviour with NaN and infinities 297 // the only guarantee for now is that _mm_log_ps(negative) yield a NaN 298 { 299 __m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan)); 300 // DOESN'T PASS 301 // assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity 302 // DOESN'T PASS 303 // assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity 304 assert(isNaN(R.array[2])); // log(negative number) = NaN 305 306 // DOESN'T PASS 307 //assert(isNaN(R[3])); // log(NaN) = NaN 308 } 309 310 311 // test _mm_pow_ps 312 for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1) 313 { 314 foreach (exponent; -8..4) 315 { 316 double powExponent = mantissa * 2.0 ^^ exponent; 317 318 for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.1) 319 { 320 foreach (exponent2; -4..4) 321 { 322 double powBase = mantissa2 * 2.0 ^^ exponent2; 323 double phobosValue = pow(powBase, powExponent); 324 float fPhobos = phobosValue; 325 if (!isFinite(fPhobos)) continue; 326 __m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent)); 327 328 foreach(i; 0..4) 329 { 330 if (!approxEquals(phobosValue, v.array[i], 1e-5)) 331 { 332 printf("%g ^^ %g\n", powBase, powExponent); 333 assert(false); 334 } 335 } 336 } 337 } 338 } 339 } 340 } 341 342 private: 343 344 static immutable __m128 _ps_1 = [1.0f, 1.0f, 1.0f, 1.0f]; 345 static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f]; 346 static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f];