1 /** 2 * Transcendental function on 4 numbers at once. 3 * 4 * Copyright: Copyright Auburn Sounds 2016-2018. 5 * Copyright (C) 2007 Julien Pommier 6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 7 * Authors: Guillaume Piolat 8 */ 9 module inteli.math; 10 11 /* Copyright (C) 2007 Julien Pommier 12 13 This software is provided 'as-is', without any express or implied 14 warranty. In no event will the authors be held liable for any damages 15 arising from the use of this software. 16 17 Permission is granted to anyone to use this software for any purpose, 18 including commercial applications, and to alter it and redistribute it 19 freely, subject to the following restrictions: 20 21 1. The origin of this software must not be misrepresented; you must not 22 claim that you wrote the original software. If you use this software 23 in a product, an acknowledgment in the product documentation would be 24 appreciated but is not required. 25 2. Altered source versions must be plainly marked as such, and must not be 26 misrepresented as being the original software. 27 3. This notice may not be removed or altered from any source distribution. 28 29 (this is the zlib license) 30 */ 31 import inteli.emmintrin; 32 import inteli.internals; 33 34 nothrow @nogc: 35 36 /// Natural `log` computed for a single 32-bit float. 37 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50 38 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 39 // #BONUS 40 float _mm_log_ss(float v) pure @safe 41 { 42 __m128 r = _mm_log_ps(_mm_set1_ps(v)); 43 return r.array[0]; 44 } 45 46 /// Natural logarithm computed for 4 simultaneous float. 47 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50 48 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 49 // #BONUS 50 __m128 _mm_log_ps(__m128 x) pure @safe 51 { 52 static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000]; 53 static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524]; 54 static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2]; 55 static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1]; 56 static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1]; 57 static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1]; 58 static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1]; 59 static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1]; 60 static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1]; 61 static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1]; 62 static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1]; 63 static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4]; 64 static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375]; 65 66 /* the smallest non denormalized float number */ 67 static immutable __m128i _psi_min_norm_pos = [0x00800000, 0x00800000, 0x00800000, 0x00800000]; 68 69 __m128i emm0; 70 __m128 one = _ps_1; 71 __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); 72 x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos); /* cut off denormalized stuff */ 73 emm0 = _mm_srli_epi32(cast(__m128i)x, 23); 74 75 /* keep only the fractional part */ 76 x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask); 77 x = _mm_or_ps(x, _ps_0p5); 78 79 emm0 = _mm_sub_epi32(emm0, _pi32_0x7f); 80 __m128 e = _mm_cvtepi32_ps(emm0); 81 e += one; 82 __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF); 83 __m128 tmp = _mm_and_ps(x, mask); 84 x -= one; 85 e -= _mm_and_ps(one, mask); 86 x += tmp; 87 __m128 z = x * x; 88 __m128 y = _ps_cephes_log_p0; 89 y = _mm_mul_ps(y, x); 90 y = _mm_add_ps(y, _ps_cephes_log_p1); 91 y = _mm_mul_ps(y, x); 92 y = _mm_add_ps(y, _ps_cephes_log_p2); 93 y = _mm_mul_ps(y, x); 94 y = _mm_add_ps(y, _ps_cephes_log_p3); 95 y = _mm_mul_ps(y, x); 96 y = _mm_add_ps(y, _ps_cephes_log_p4); 97 y = _mm_mul_ps(y, x); 98 y = _mm_add_ps(y, _ps_cephes_log_p5); 99 y = _mm_mul_ps(y, x); 100 y = _mm_add_ps(y, _ps_cephes_log_p6); 101 y = _mm_mul_ps(y, x); 102 y = _mm_add_ps(y, _ps_cephes_log_p7); 103 y = _mm_mul_ps(y, x); 104 y = _mm_add_ps(y, _ps_cephes_log_p8); 105 y = _mm_mul_ps(y, x); 106 y = _mm_mul_ps(y, z); 107 tmp = _mm_mul_ps(e, _ps_cephes_log_q1); 108 y = _mm_add_ps(y, tmp); 109 tmp = _mm_mul_ps(z, _ps_0p5); 110 y = _mm_sub_ps(y, tmp); 111 tmp = _mm_mul_ps(e, _ps_cephes_log_q2); 112 x = _mm_add_ps(x, y); 113 x = _mm_add_ps(x, tmp); 114 x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN 115 return x; 116 } 117 118 /// Natural `exp` computed for a single float. 119 /// This is an approximation, valid up to approximately -109dB of accuracy 120 /// IMPORTANT: NaN input not supported. 121 // #BONUS 122 float _mm_exp_ss(float v) pure @safe 123 { 124 __m128 r = _mm_exp_ps(_mm_set1_ps(v)); 125 return r.array[0]; 126 } 127 128 /// Natural `exp` computed for 4 simultaneous float in `x`. 129 /// This is an approximation, valid up to approximately -109dB of accuracy 130 /// IMPORTANT: NaN input not supported. 131 // #BONUS 132 __m128 _mm_exp_ps(__m128 x) pure @safe 133 { 134 static immutable __m128 _ps_exp_hi = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f]; 135 static immutable __m128 _ps_exp_lo = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f]; 136 static immutable __m128 _ps_cephes_LOG2EF = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341]; 137 static immutable __m128 _ps_cephes_exp_C1 = [0.693359375, 0.693359375, 0.693359375, 0.693359375]; 138 static immutable __m128 _ps_cephes_exp_C2 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4]; 139 static immutable __m128 _ps_cephes_exp_p0 = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4]; 140 static immutable __m128 _ps_cephes_exp_p1 = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3]; 141 static immutable __m128 _ps_cephes_exp_p2 = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3]; 142 static immutable __m128 _ps_cephes_exp_p3 = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2]; 143 static immutable __m128 _ps_cephes_exp_p4 = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1]; 144 static immutable __m128 _ps_cephes_exp_p5 = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1]; 145 146 __m128 tmp = _mm_setzero_ps(), fx; 147 __m128i emm0; 148 __m128 one = _ps_1; 149 150 x = _mm_min_ps(x, _ps_exp_hi); 151 x = _mm_max_ps(x, _ps_exp_lo); 152 153 /* express exp(x) as exp(g + n*log(2)) */ 154 fx = _mm_mul_ps(x, _ps_cephes_LOG2EF); 155 fx = _mm_add_ps(fx, _ps_0p5); 156 157 /* how to perform a floorf with SSE: just below */ 158 emm0 = _mm_cvttps_epi32(fx); 159 tmp = _mm_cvtepi32_ps(emm0); 160 161 /* if greater, substract 1 */ 162 __m128 mask = _mm_cmpgt_ps(tmp, fx); 163 mask = _mm_and_ps(mask, one); 164 fx = tmp - mask; 165 166 tmp = _mm_mul_ps(fx, _ps_cephes_exp_C1); 167 __m128 z = _mm_mul_ps(fx, _ps_cephes_exp_C2); 168 x -= tmp; 169 x -= z; 170 171 z = x * x; 172 173 __m128 y = _ps_cephes_exp_p0; 174 y = _mm_mul_ps(y, x); 175 y = _mm_add_ps(y, _ps_cephes_exp_p1); 176 y = _mm_mul_ps(y, x); 177 y = _mm_add_ps(y, _ps_cephes_exp_p2); 178 y = _mm_mul_ps(y, x); 179 y = _mm_add_ps(y, _ps_cephes_exp_p3); 180 y = _mm_mul_ps(y, x); 181 y = _mm_add_ps(y, _ps_cephes_exp_p4); 182 y = _mm_mul_ps(y, x); 183 y = _mm_add_ps(y, _ps_cephes_exp_p5); 184 y = _mm_mul_ps(y, z); 185 y = _mm_add_ps(y, x); 186 y += one; 187 188 /* build 2^n */ 189 emm0 = _mm_cvttps_epi32(fx); 190 191 emm0 = _mm_add_epi32(emm0, _pi32_0x7f); 192 emm0 = _mm_slli_epi32(emm0, 23); 193 __m128 pow2n = cast(__m128)emm0; 194 y *= pow2n; 195 return y; 196 } 197 198 /// Computes `base^exponent` for a single 32-bit float. 199 /// This is an approximation, valid up to approximately -100dB of accuracy 200 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 201 // #BONUS 202 float _mm_pow_ss(float base, float exponent) pure @safe 203 { 204 __m128 r = _mm_pow_ps(_mm_set1_ps(base), _mm_set1_ps(exponent)); 205 return r.array[0]; 206 } 207 208 /// Computes `base^exponent`, for 4 floats at once. 209 /// This is an approximation, valid up to approximately -100dB of accuracy 210 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 211 // #BONUS 212 __m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe 213 { 214 return _mm_exp_ps(exponents * _mm_log_ps(base)); 215 } 216 217 /// Computes `base^exponent`, for 4 floats at once. 218 /// This is an approximation, valid up to approximately -100dB of accuracy 219 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite. 220 // #BONUS 221 __m128 _mm_pow_ps(__m128 base, float exponent) pure @safe 222 { 223 return _mm_exp_ps(_mm_set1_ps(exponent) * _mm_log_ps(base)); 224 } 225 226 unittest 227 { 228 import std.math; 229 230 bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow 231 { 232 if (!isFinite(groundTruth)) 233 return true; // no need to approximate where this is NaN or infinite 234 235 if (groundTruth == 0) // the approximaton should produce zero too if needed 236 { 237 return approx == 0; 238 } 239 240 if (approx == 0) 241 { 242 // If the approximation produces zero, the error should be below 140 dB 243 return ( abs(groundTruth) < 1e-7 ); 244 } 245 246 if ( ( abs(groundTruth / approx) - 1 ) >= epsilon) 247 { 248 import core.stdc.stdio; 249 debug printf("approxEquals (%g, %g, %g) failed\n", groundTruth, approx, epsilon); 250 debug printf("ratio is %f\n", abs(groundTruth / approx) - 1); 251 } 252 253 return ( abs(groundTruth / approx) - 1 ) < epsilon; 254 } 255 256 // test _mm_log_ps 257 for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.05) 258 { 259 foreach (exponent; -23..23) 260 { 261 double x = mantissa * 2.0 ^^ exponent; 262 double phobosValue = log(x); 263 __m128 v = _mm_log_ps(_mm_set1_ps(x)); 264 foreach(i; 0..4) 265 assert(approxEquals(phobosValue, v.array[i], 1.1e-6)); 266 } 267 } 268 269 // test _mm_exp_ps 270 for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1) 271 { 272 foreach (exponent; -23..23) 273 { 274 double x = mantissa * 2.0 ^^ exponent; 275 276 // don't test too high numbers because they saturate FP precision pretty fast 277 if (x > 50) continue; 278 279 double phobosValue = exp(x); 280 __m128 v = _mm_exp_ps(_mm_set1_ps(x)); 281 foreach(i; 0..4) 282 assert(approxEquals(phobosValue, v.array[i], 3.4e-6)); 283 } 284 } 285 286 // test than exp(-inf) is 0 287 { 288 __m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity)); 289 float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f]; 290 assert(R.array == correct); 291 } 292 293 // test log baheviour with NaN and infinities 294 // the only guarantee for now is that _mm_log_ps(negative) yield a NaN 295 { 296 __m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan)); 297 // DOESN'T PASS 298 // assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity 299 // DOESN'T PASS 300 // assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity 301 assert(isNaN(R.array[2])); // log(negative number) = NaN 302 303 // DOESN'T PASS 304 //assert(isNaN(R[3])); // log(NaN) = NaN 305 } 306 307 308 // test _mm_pow_ps 309 for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1) 310 { 311 foreach (exponent; -8..4) 312 { 313 double powExponent = mantissa * 2.0 ^^ exponent; 314 315 for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.1) 316 { 317 foreach (exponent2; -4..4) 318 { 319 double powBase = mantissa2 * 2.0 ^^ exponent2; 320 double phobosValue = pow(powBase, powExponent); 321 float fPhobos = phobosValue; 322 if (!isFinite(fPhobos)) continue; 323 __m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent)); 324 325 foreach(i; 0..4) 326 { 327 if (!approxEquals(phobosValue, v.array[i], 1e-5)) 328 { 329 printf("%g ^^ %g\n", powBase, powExponent); 330 assert(false); 331 } 332 } 333 } 334 } 335 } 336 } 337 } 338 339 private: 340 341 static immutable __m128 _ps_1 = [1.0f, 1.0f, 1.0f, 1.0f]; 342 static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f]; 343 static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f];