1 /** 2 * Transcendental function on 4 numbers at once. 3 * 4 * Copyright: Copyright Auburn Sounds 2016-2018. 5 * Copyright (C) 2007 Julien Pommier 6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 7 * Authors: Guillaume Piolat 8 */ 9 module inteli.math; 10 11 /* Copyright (C) 2007 Julien Pommier 12 13 This software is provided 'as-is', without any express or implied 14 warranty. In no event will the authors be held liable for any damages 15 arising from the use of this software. 16 17 Permission is granted to anyone to use this software for any purpose, 18 including commercial applications, and to alter it and redistribute it 19 freely, subject to the following restrictions: 20 21 1. The origin of this software must not be misrepresented; you must not 22 claim that you wrote the original software. If you use this software 23 in a product, an acknowledgment in the product documentation would be 24 appreciated but is not required. 25 2. Altered source versions must be plainly marked as such, and must not be 26 misrepresented as being the original software. 27 3. This notice may not be removed or altered from any source distribution. 28 29 (this is the zlib license) 30 */ 31 import inteli.emmintrin; 32 import inteli.internals; 33 34 static immutable __m128 _ps_1 = [1.0f, 1.0f, 1.0f, 1.0f]; 35 static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f]; 36 37 /* the smallest non denormalized float number */ 38 static immutable __m128i _psi_min_norm_pos = [0x00800000, 0x00800000, 0x00800000, 0x00800000]; 39 40 static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f]; 41 42 43 nothrow @nogc: 44 45 /// Natural `log` computed for a single 32-bit float. 46 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50 47 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 48 // #BONUS 49 float _mm_log_ss(float v) pure @safe 50 { 51 __m128 r = _mm_log_ps(_mm_set1_ps(v)); 52 return r[0]; 53 } 54 55 /// Natural logarithm computed for 4 simultaneous float. 56 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50 57 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 58 // #BONUS 59 __m128 _mm_log_ps(__m128 x) pure @safe 60 { 61 static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000]; 62 static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524]; 63 static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2]; 64 static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1]; 65 static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1]; 66 static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1]; 67 static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1]; 68 static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1]; 69 static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1]; 70 static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1]; 71 static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1]; 72 static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4]; 73 static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375]; 74 75 __m128i emm0; 76 __m128 one = _ps_1; 77 __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); 78 x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos); /* cut off denormalized stuff */ 79 emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); 80 81 /* keep only the fractional part */ 82 x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask); 83 x = _mm_or_ps(x, _ps_0p5); 84 85 emm0 = _mm_sub_epi32(emm0, cast(__m128i)_pi32_0x7f); 86 __m128 e = _mm_cvtepi32_ps(emm0); 87 e += one; 88 __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF); 89 __m128 tmp = _mm_and_ps(x, mask); 90 x -= one; 91 e -= _mm_and_ps(one, mask); 92 x += tmp; 93 __m128 z = x * x; 94 __m128 y = _ps_cephes_log_p0; 95 y *= x; 96 y += _ps_cephes_log_p1; 97 y *= x; 98 y += _ps_cephes_log_p2; 99 y *= x; 100 y += _ps_cephes_log_p3; 101 y *= x; 102 y += _ps_cephes_log_p4; 103 y *= x; 104 y += _ps_cephes_log_p5; 105 y *= x; 106 y += _ps_cephes_log_p6; 107 y *= x; 108 y += _ps_cephes_log_p7; 109 y *= x; 110 y += _ps_cephes_log_p8; 111 y *= x; 112 y *= z; 113 tmp = e * _ps_cephes_log_q1; 114 y += tmp; 115 tmp = z * _ps_0p5; 116 y = y - tmp; 117 tmp = e * _ps_cephes_log_q2; 118 x += y; 119 x += tmp; 120 x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN 121 return x; 122 } 123 124 /// Natural `exp` computed for a single float. 125 /// This is an approximation, valid up to approximately -109dB of accuracy 126 /// IMPORTANT: NaN input not supported. 127 // #BONUS 128 float _mm_exp_ss(float v) pure @safe 129 { 130 __m128 r = _mm_exp_ps(_mm_set1_ps(v)); 131 return r[0]; 132 } 133 134 /// Natural `exp` computed for 4 simultaneous float in `x`. 135 /// This is an approximation, valid up to approximately -109dB of accuracy 136 /// IMPORTANT: NaN input not supported. 137 // #BONUS 138 __m128 _mm_exp_ps(__m128 x) pure @safe 139 { 140 static immutable __m128 _ps_exp_hi = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f]; 141 static immutable __m128 _ps_exp_lo = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f]; 142 static immutable __m128 _ps_cephes_LOG2EF = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341]; 143 static immutable __m128 _ps_cephes_exp_C1 = [0.693359375, 0.693359375, 0.693359375, 0.693359375]; 144 static immutable __m128 _ps_cephes_exp_C2 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4]; 145 static immutable __m128 _ps_cephes_exp_p0 = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4]; 146 static immutable __m128 _ps_cephes_exp_p1 = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3]; 147 static immutable __m128 _ps_cephes_exp_p2 = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3]; 148 static immutable __m128 _ps_cephes_exp_p3 = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2]; 149 static immutable __m128 _ps_cephes_exp_p4 = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1]; 150 static immutable __m128 _ps_cephes_exp_p5 = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1]; 151 152 __m128 tmp = _mm_setzero_ps(), fx; 153 __m128i emm0; 154 __m128 one = _ps_1; 155 156 x = _mm_min_ps(x, _ps_exp_hi); 157 x = _mm_max_ps(x, _ps_exp_lo); 158 159 /* express exp(x) as exp(g + n*log(2)) */ 160 fx = x * _ps_cephes_LOG2EF; 161 fx += _ps_0p5; 162 163 /* how to perform a floorf with SSE: just below */ 164 emm0 = _mm_cvttps_epi32(fx); 165 tmp = _mm_cvtepi32_ps(emm0); 166 167 /* if greater, substract 1 */ 168 __m128 mask = _mm_cmpgt_ps(tmp, fx); 169 mask = _mm_and_ps(mask, one); 170 fx = tmp - mask; 171 172 tmp = fx * _ps_cephes_exp_C1; 173 __m128 z = fx * _ps_cephes_exp_C2; 174 x -= tmp; 175 x -= z; 176 177 z = x * x; 178 179 __m128 y = _ps_cephes_exp_p0; 180 y *= x; 181 y += _ps_cephes_exp_p1; 182 y *= x; 183 y += _ps_cephes_exp_p2; 184 y *= x; 185 y += _ps_cephes_exp_p3; 186 y *= x; 187 y += _ps_cephes_exp_p4; 188 y *= x; 189 y += _ps_cephes_exp_p5; 190 y *= z; 191 y += x; 192 y += one; 193 194 /* build 2^n */ 195 emm0 = _mm_cvttps_epi32(fx); 196 emm0 = _mm_add_epi32(emm0, cast(__m128i)_pi32_0x7f); 197 emm0 = _mm_slli_epi32(emm0, 23); 198 __m128 pow2n = _mm_castsi128_ps(emm0); 199 y = _mm_mul_ps(y, pow2n); 200 return y; 201 } 202 203 /// Computes `base^exponent` for a single 32-bit float. 204 /// This is an approximation, valid up to approximately -100dB of accuracy 205 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 206 // #BONUS 207 float _mm_pow_ss(float base, float exponent) pure @safe 208 { 209 __m128 r = _mm_pow_ps(_mm_set1_ps(base), _mm_set1_ps(exponent)); 210 return r[0]; 211 } 212 213 /// Computes `base^exponent`, for 4 floats at once. 214 /// This is an approximation, valid up to approximately -100dB of accuracy 215 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 216 // #BONUS 217 __m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe 218 { 219 return _mm_exp_ps(exponents * _mm_log_ps(base)); 220 } 221 222 /// Computes `base^exponent`, for 4 floats at once. 223 /// This is an approximation, valid up to approximately -100dB of accuracy 224 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 225 // #BONUS 226 __m128 _mm_pow_ps(__m128 base, float exponent) pure @safe 227 { 228 return _mm_exp_ps(_mm_set1_ps(exponent) * _mm_log_ps(base)); 229 } 230 231 unittest 232 { 233 import std.math; 234 235 bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow 236 { 237 if (!isFinite(groundTruth)) 238 return true; // no need to approximate where this is NaN or infinite 239 240 if (groundTruth == 0) // the approximaton should produce zero too if needed 241 { 242 return approx == 0; 243 } 244 245 if (approx == 0) 246 { 247 // If the approximation produces zero, the error should be below 140 dB 248 return ( abs(groundTruth) < 1e-7 ); 249 } 250 251 if ( ( abs(groundTruth / approx) - 1 ) >= epsilon) 252 { 253 import core.stdc.stdio; 254 debug printf("approxEquals (%g, %g, %g) failed\n", groundTruth, approx, epsilon); 255 debug printf("ratio is %f\n", abs(groundTruth / approx) - 1); 256 } 257 258 return ( abs(groundTruth / approx) - 1 ) < epsilon; 259 } 260 261 // test _mm_log_ps 262 for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.005) 263 { 264 foreach (exponent; -23..23) 265 { 266 double x = mantissa * 2.0 ^^ exponent; 267 double phobosValue = log(x); 268 __m128 v = _mm_log_ps(_mm_set1_ps(x)); 269 foreach(i; 0..4) 270 assert(approxEquals(phobosValue, v[i], 1.1e-6)); 271 } 272 } 273 274 // test _mm_exp_ps 275 for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.01) 276 { 277 foreach (exponent; -23..23) 278 { 279 double x = mantissa * 2.0 ^^ exponent; 280 281 // don't test too high numbers because they saturate FP precision pretty fast 282 if (x > 50) continue; 283 284 double phobosValue = exp(x); 285 __m128 v = _mm_exp_ps(_mm_set1_ps(x)); 286 foreach(i; 0..4) 287 assert(approxEquals(phobosValue, v[i], 3.4e-6)); 288 } 289 } 290 291 // test than exp(-inf) is 0 292 { 293 __m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity)); 294 float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f]; 295 assert(R.array == correct); 296 } 297 298 // test log baheviour with NaN and infinities 299 // the only guarantee for now is that _mm_log_ps(negative) yield a NaN 300 { 301 __m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan)); 302 // DOESN'T PASS 303 // assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity 304 // DOESN'T PASS 305 // assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity 306 assert(isNaN(R[2])); // log(negative number) = NaN 307 308 // DOESN'T PASS 309 //assert(isNaN(R[3])); // log(NaN) = NaN 310 } 311 312 313 // test _mm_pow_ps 314 for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.01) 315 { 316 foreach (exponent; -8..4) 317 { 318 double powExponent = mantissa * 2.0 ^^ exponent; 319 320 for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.01) 321 { 322 foreach (exponent2; -4..4) 323 { 324 double powBase = mantissa2 * 2.0 ^^ exponent2; 325 double phobosValue = pow(powBase, powExponent); 326 float fPhobos = phobosValue; 327 if (!isFinite(fPhobos)) continue; 328 __m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent)); 329 330 foreach(i; 0..4) 331 { 332 if (!approxEquals(phobosValue, v[i], 1e-5)) 333 { 334 printf("%g ^^ %g\n", powBase, powExponent); 335 assert(false); 336 } 337 } 338 } 339 } 340 } 341 } 342 }