inteli.math source code

1 /**
2 * Transcendental function on 4 numbers at once.
3 *
4 * Copyright: Copyright Auburn Sounds 2016-2018.
5 *            Copyright (C) 2007  Julien Pommier
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 * Authors:   Guillaume Piolat
8 */
9 module inteli.math;
10 
11 /* Copyright (C) 2007  Julien Pommier
12 
13   This software is provided 'as-is', without any express or implied
14   warranty.  In no event will the authors be held liable for any damages
15   arising from the use of this software.
16 
17   Permission is granted to anyone to use this software for any purpose,
18   including commercial applications, and to alter it and redistribute it
19   freely, subject to the following restrictions:
20 
21   1. The origin of this software must not be misrepresented; you must not
22      claim that you wrote the original software. If you use this software
23      in a product, an acknowledgment in the product documentation would be
24      appreciated but is not required.
25   2. Altered source versions must be plainly marked as such, and must not be
26      misrepresented as being the original software.
27   3. This notice may not be removed or altered from any source distribution.
28 
29   (this is the zlib license)
30 */
31 import inteli.emmintrin;
32 import inteli.internals;
33 
34 nothrow @nogc:
35 
36 /// Natural `log` computed for a single 32-bit float.
37 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
38 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
39 // #BONUS
40 float _mm_log_ss(float v) pure @safe
41 {
42     __m128 r = _mm_log_ps(_mm_set1_ps(v));
43     return r[0];
44 }
45 
46 /// Natural logarithm computed for 4 simultaneous float.
47 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
48 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
49 // #BONUS
50 __m128 _mm_log_ps(__m128 x) pure @safe
51 {
52     static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000];
53     static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524];
54     static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2];
55     static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1];
56     static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1];
57     static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1];
58     static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1];
59     static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1];
60     static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1];
61     static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1];
62     static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1];
63     static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
64     static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
65 
66     /* the smallest non denormalized float number */
67     static immutable __m128i _psi_min_norm_pos  = [0x00800000,   0x00800000,   0x00800000, 0x00800000];
68 
69     __m128i emm0;
70     __m128 one = _ps_1;
71     __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
72     x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos);  /* cut off denormalized stuff */
73     emm0 = _mm_srli_epi32(cast(__m128i)x, 23);
74 
75     /* keep only the fractional part */
76     x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask);
77     x = _mm_or_ps(x, _ps_0p5);
78 
79     emm0 -= _pi32_0x7f;
80     __m128 e = _mm_cvtepi32_ps(emm0);
81     e += one;
82     __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF);
83     __m128 tmp = _mm_and_ps(x, mask);
84     x -= one;
85     e -= _mm_and_ps(one, mask);
86     x += tmp;
87     __m128 z = x * x;
88     __m128 y = _ps_cephes_log_p0;
89     y *= x;
90     y += _ps_cephes_log_p1;
91     y *= x;
92     y += _ps_cephes_log_p2;
93     y *= x;
94     y += _ps_cephes_log_p3;
95     y *= x;
96     y += _ps_cephes_log_p4;
97     y *= x;
98     y += _ps_cephes_log_p5;
99     y *= x;
100     y += _ps_cephes_log_p6;
101     y *= x;
102     y += _ps_cephes_log_p7;
103     y *= x;
104     y += _ps_cephes_log_p8;
105     y *= x;
106     y *= z;
107     tmp = e * _ps_cephes_log_q1;
108     y += tmp;
109     tmp = z * _ps_0p5;
110     y = y - tmp;
111     tmp = e * _ps_cephes_log_q2;
112     x += y;
113     x += tmp;
114     x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
115     return x;
116 }
117 
118 /// Natural `exp` computed for a single float.
119 /// This is an approximation, valid up to approximately -109dB of accuracy
120 /// IMPORTANT: NaN input not supported.
121 // #BONUS
122 float _mm_exp_ss(float v) pure @safe
123 {
124     __m128 r = _mm_exp_ps(_mm_set1_ps(v));
125     return r[0];
126 }
127 
128 /// Natural `exp` computed for 4 simultaneous float in `x`.
129 /// This is an approximation, valid up to approximately -109dB of accuracy
130 /// IMPORTANT: NaN input not supported.
131 // #BONUS
132 __m128 _mm_exp_ps(__m128 x) pure @safe
133 {
134     static immutable __m128 _ps_exp_hi         = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f];
135     static immutable __m128 _ps_exp_lo         = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f];
136     static immutable __m128 _ps_cephes_LOG2EF  = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341];
137     static immutable __m128 _ps_cephes_exp_C1  = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
138     static immutable __m128 _ps_cephes_exp_C2  = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
139     static immutable __m128 _ps_cephes_exp_p0  = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4];
140     static immutable __m128 _ps_cephes_exp_p1  = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3];
141     static immutable __m128 _ps_cephes_exp_p2  = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3];
142     static immutable __m128 _ps_cephes_exp_p3  = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2];
143     static immutable __m128 _ps_cephes_exp_p4  = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1];
144     static immutable __m128 _ps_cephes_exp_p5  = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1];
145 
146     __m128 tmp = _mm_setzero_ps(), fx;
147     __m128i emm0;
148     __m128 one = _ps_1;
149 
150     x = _mm_min_ps(x, _ps_exp_hi);
151     x = _mm_max_ps(x, _ps_exp_lo);
152 
153     /* express exp(x) as exp(g + n*log(2)) */
154     fx = x * _ps_cephes_LOG2EF;
155     fx += _ps_0p5;
156 
157     /* how to perform a floorf with SSE: just below */
158     emm0 = _mm_cvttps_epi32(fx);
159     tmp  = _mm_cvtepi32_ps(emm0);
160 
161     /* if greater, substract 1 */
162     __m128 mask = _mm_cmpgt_ps(tmp, fx);    
163     mask = _mm_and_ps(mask, one);
164     fx = tmp - mask;
165 
166     tmp = fx * _ps_cephes_exp_C1;
167     __m128 z = fx * _ps_cephes_exp_C2;
168     x -= tmp;
169     x -= z;
170 
171     z = x * x;
172 
173     __m128 y = _ps_cephes_exp_p0;
174     y *= x;
175     y += _ps_cephes_exp_p1;
176     y *= x;
177     y += _ps_cephes_exp_p2;
178     y *= x;
179     y += _ps_cephes_exp_p3;
180     y *= x;
181     y += _ps_cephes_exp_p4;
182     y *= x;
183     y += _ps_cephes_exp_p5;
184     y *= z;
185     y += x;
186     y += one;
187 
188     /* build 2^n */
189     emm0 = _mm_cvttps_epi32(fx);
190     emm0 += _pi32_0x7f;
191     emm0 = _mm_slli_epi32(emm0, 23);
192     __m128 pow2n = cast(__m128)emm0;
193     y *= pow2n;
194     return y;
195 }
196 
197 /// Computes `base^exponent` for a single 32-bit float.
198 /// This is an approximation, valid up to approximately -100dB of accuracy
199 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
200 // #BONUS
201 float _mm_pow_ss(float base, float exponent) pure @safe
202 {
203     __m128 r = _mm_pow_ps(_mm_set1_ps(base), _mm_set1_ps(exponent));
204     return r[0];
205 }
206 
207 /// Computes `base^exponent`, for 4 floats at once.
208 /// This is an approximation, valid up to approximately -100dB of accuracy
209 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
210 // #BONUS
211 __m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe
212 {
213     return _mm_exp_ps(exponents * _mm_log_ps(base));
214 }
215 
216 /// Computes `base^exponent`, for 4 floats at once.
217 /// This is an approximation, valid up to approximately -100dB of accuracy
218 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
219 // #BONUS
220 __m128 _mm_pow_ps(__m128 base, float exponent) pure @safe
221 {
222     return _mm_exp_ps(_mm_set1_ps(exponent) * _mm_log_ps(base));
223 }
224 
225 unittest
226 {
227     import std.math;
228 
229     bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow
230     {
231         if (!isFinite(groundTruth))
232             return true; // no need to approximate where this is NaN or infinite
233 
234         if (groundTruth == 0) // the approximaton should produce zero too if needed
235         {
236             return approx == 0;
237         }
238 
239         if (approx == 0)
240         {
241             // If the approximation produces zero, the error should be below 140 dB
242             return ( abs(groundTruth) < 1e-7 );
243         }
244 
245         if ( ( abs(groundTruth / approx) - 1 ) >= epsilon)
246         {
247             import core.stdc.stdio;
248             debug printf("approxEquals (%g, %g, %g) failed\n", groundTruth, approx, epsilon);
249             debug printf("ratio is %f\n", abs(groundTruth / approx) - 1);
250         }
251 
252         return ( abs(groundTruth / approx) - 1 ) < epsilon;
253     }
254 
255     // test _mm_log_ps
256     for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.05)
257     {
258         foreach (exponent; -23..23)
259         {
260             double x = mantissa * 2.0 ^^ exponent;
261             double phobosValue = log(x);
262             __m128 v = _mm_log_ps(_mm_set1_ps(x));
263             foreach(i; 0..4)
264                 assert(approxEquals(phobosValue, v[i], 1.1e-6));
265         }
266     }
267 
268     // test _mm_exp_ps    
269     for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1)
270     {
271         foreach (exponent; -23..23)
272         {
273             double x = mantissa * 2.0 ^^ exponent;
274 
275             // don't test too high numbers because they saturate FP precision pretty fast
276             if (x > 50) continue;
277 
278             double phobosValue = exp(x);
279             __m128 v = _mm_exp_ps(_mm_set1_ps(x));
280             foreach(i; 0..4)
281                assert(approxEquals(phobosValue, v[i], 3.4e-6));
282         }
283     }
284 
285     // test than exp(-inf) is 0
286     {
287         __m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity));
288         float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
289         assert(R.array == correct);
290     }
291 
292     // test log baheviour with NaN and infinities
293     // the only guarantee for now is that _mm_log_ps(negative) yield a NaN
294     {
295         __m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan));
296       // DOESN'T PASS
297       //  assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity
298       // DOESN'T PASS
299       //  assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity
300         assert(isNaN(R[2])); // log(negative number) = NaN
301 
302         // DOESN'T PASS
303         //assert(isNaN(R[3])); // log(NaN) = NaN
304     }
305 
306 
307     // test _mm_pow_ps
308     for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1)
309     {
310         foreach (exponent; -8..4)
311         {
312             double powExponent = mantissa * 2.0 ^^ exponent;
313 
314             for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.1)
315             {
316                 foreach (exponent2; -4..4)
317                 {
318                     double powBase = mantissa2 * 2.0 ^^ exponent2;
319                     double phobosValue = pow(powBase, powExponent);
320                     float fPhobos = phobosValue;
321                     if (!isFinite(fPhobos)) continue;
322                      __m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent));
323 
324                     foreach(i; 0..4)
325                     {
326                         if (!approxEquals(phobosValue, v[i], 1e-5))
327                         {
328                             printf("%g ^^ %g\n", powBase, powExponent);
329                             assert(false);
330                         }
331                     }
332                 }
333             }
334         }
335     }
336 }
337 
338 private:
339 
340 static immutable __m128 _ps_1   = [1.0f, 1.0f, 1.0f, 1.0f];
341 static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f];
342 static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f];