inteli.math source code

1 /**
2 * Transcendental function on 4 numbers at once.
3 *
4 * Copyright: Copyright Auburn Sounds 2016-2018.
5 *            Copyright (C) 2007  Julien Pommier
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 * Authors:   Guillaume Piolat
8 */
9 module inteli.math;
10 
11 /* Copyright (C) 2007  Julien Pommier
12 
13   This software is provided 'as-is', without any express or implied
14   warranty.  In no event will the authors be held liable for any damages
15   arising from the use of this software.
16 
17   Permission is granted to anyone to use this software for any purpose,
18   including commercial applications, and to alter it and redistribute it
19   freely, subject to the following restrictions:
20 
21   1. The origin of this software must not be misrepresented; you must not
22      claim that you wrote the original software. If you use this software
23      in a product, an acknowledgment in the product documentation would be
24      appreciated but is not required.
25   2. Altered source versions must be plainly marked as such, and must not be
26      misrepresented as being the original software.
27   3. This notice may not be removed or altered from any source distribution.
28 
29   (this is the zlib license)
30 */
31 import inteli.emmintrin;
32 import inteli.internals;
33 
34 nothrow @nogc:
35 
36 /// Natural `log` computed for a single 32-bit float.
37 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
38 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
39 // #BONUS
40 float _mm_log_ss(float v) pure @safe
41 {
42     __m128 r = _mm_log_ps(_mm_set1_ps(v));
43     return r.array[0];
44 }
45 
46 /// Natural logarithm computed for 4 simultaneous float.
47 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
48 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
49 // #BONUS
50 __m128 _mm_log_ps(__m128 x) pure @safe
51 {
52     static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000];
53     static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524];
54     static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2];
55     static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1];
56     static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1];
57     static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1];
58     static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1];
59     static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1];
60     static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1];
61     static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1];
62     static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1];
63     static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
64     static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
65 
66     /* the smallest non denormalized float number */
67     static immutable __m128i _psi_min_norm_pos  = [0x00800000,   0x00800000,   0x00800000, 0x00800000];
68 
69     __m128i emm0;
70     __m128 one = _ps_1;
71     __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
72     x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos);  /* cut off denormalized stuff */
73     emm0 = _mm_srli_epi32(cast(__m128i)x, 23);
74 
75     /* keep only the fractional part */
76     x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask);
77     x = _mm_or_ps(x, _ps_0p5);
78 
79     emm0 = _mm_sub_epi32(emm0, _pi32_0x7f);
80     __m128 e = _mm_cvtepi32_ps(emm0);
81     e += one;
82     __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF);
83     __m128 tmp = _mm_and_ps(x, mask);
84     x -= one;
85     e -= _mm_and_ps(one, mask);
86     x += tmp;
87     __m128 z = x * x;
88     __m128 y = _ps_cephes_log_p0;
89     y = _mm_mul_ps(y, x);
90     y = _mm_add_ps(y, _ps_cephes_log_p1);
91     y = _mm_mul_ps(y, x);
92     y = _mm_add_ps(y, _ps_cephes_log_p2);
93     y = _mm_mul_ps(y, x);
94     y = _mm_add_ps(y, _ps_cephes_log_p3);
95     y = _mm_mul_ps(y, x);
96     y = _mm_add_ps(y, _ps_cephes_log_p4);
97     y = _mm_mul_ps(y, x);
98     y = _mm_add_ps(y, _ps_cephes_log_p5);
99     y = _mm_mul_ps(y, x);
100     y = _mm_add_ps(y, _ps_cephes_log_p6);
101     y = _mm_mul_ps(y, x);
102     y = _mm_add_ps(y, _ps_cephes_log_p7);
103     y = _mm_mul_ps(y, x);
104     y = _mm_add_ps(y, _ps_cephes_log_p8);
105     y = _mm_mul_ps(y, x);
106     y = _mm_mul_ps(y, z);
107     tmp = _mm_mul_ps(e, _ps_cephes_log_q1);
108     y = _mm_add_ps(y, tmp);
109     tmp = _mm_mul_ps(z, _ps_0p5);
110     y = _mm_sub_ps(y, tmp);
111     tmp = _mm_mul_ps(e, _ps_cephes_log_q2);
112     x = _mm_add_ps(x, y);
113     x = _mm_add_ps(x, tmp);
114     x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
115     return x;
116 }
117 
118 /// Natural `exp` computed for a single float.
119 /// This is an approximation, valid up to approximately -109dB of accuracy
120 /// IMPORTANT: NaN input not supported.
121 // #BONUS
122 float _mm_exp_ss(float v) pure @safe
123 {
124     __m128 r = _mm_exp_ps(_mm_set1_ps(v));
125     return r.array[0];
126 }
127 
128 /// Natural `exp` computed for 4 simultaneous float in `x`.
129 /// This is an approximation, valid up to approximately -109dB of accuracy
130 /// IMPORTANT: NaN input not supported.
131 // #BONUS
132 __m128 _mm_exp_ps(__m128 x) pure @safe
133 {
134     static immutable __m128 _ps_exp_hi         = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f];
135     static immutable __m128 _ps_exp_lo         = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f];
136     static immutable __m128 _ps_cephes_LOG2EF  = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341];
137     static immutable __m128 _ps_cephes_exp_C1  = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
138     static immutable __m128 _ps_cephes_exp_C2  = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
139     static immutable __m128 _ps_cephes_exp_p0  = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4];
140     static immutable __m128 _ps_cephes_exp_p1  = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3];
141     static immutable __m128 _ps_cephes_exp_p2  = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3];
142     static immutable __m128 _ps_cephes_exp_p3  = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2];
143     static immutable __m128 _ps_cephes_exp_p4  = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1];
144     static immutable __m128 _ps_cephes_exp_p5  = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1];
145 
146     __m128 tmp = _mm_setzero_ps(), fx;
147     __m128i emm0;
148     __m128 one = _ps_1;
149 
150     x = _mm_min_ps(x, _ps_exp_hi);
151     x = _mm_max_ps(x, _ps_exp_lo);
152 
153     /* express exp(x) as exp(g + n*log(2)) */
154     fx = _mm_mul_ps(x, _ps_cephes_LOG2EF);
155     fx = _mm_add_ps(fx, _ps_0p5);
156 
157     /* how to perform a floorf with SSE: just below */
158     emm0 = _mm_cvttps_epi32(fx);
159     tmp  = _mm_cvtepi32_ps(emm0);
160 
161     /* if greater, substract 1 */
162     __m128 mask = _mm_cmpgt_ps(tmp, fx);
163     mask = _mm_and_ps(mask, one);
164     fx = tmp - mask;
165 
166     tmp = _mm_mul_ps(fx, _ps_cephes_exp_C1);
167     __m128 z = _mm_mul_ps(fx, _ps_cephes_exp_C2);
168     x -= tmp;
169     x -= z;
170 
171     z = x * x;
172 
173     __m128 y = _ps_cephes_exp_p0;
174     y = _mm_mul_ps(y, x);
175     y = _mm_add_ps(y, _ps_cephes_exp_p1);
176     y = _mm_mul_ps(y, x);
177     y = _mm_add_ps(y, _ps_cephes_exp_p2);
178     y = _mm_mul_ps(y, x);
179     y = _mm_add_ps(y, _ps_cephes_exp_p3);
180     y = _mm_mul_ps(y, x);
181     y = _mm_add_ps(y, _ps_cephes_exp_p4);
182     y = _mm_mul_ps(y, x);
183     y = _mm_add_ps(y, _ps_cephes_exp_p5);
184     y = _mm_mul_ps(y, z);
185     y = _mm_add_ps(y, x);
186     y += one;
187 
188     /* build 2^n */
189     emm0 = _mm_cvttps_epi32(fx);
190 
191     emm0 = _mm_add_epi32(emm0, _pi32_0x7f);
192     emm0 = _mm_slli_epi32(emm0, 23);
193     __m128 pow2n = cast(__m128)emm0;
194     y *= pow2n;
195     return y;
196 }
197 
198 /// Computes `base^exponent` for a single 32-bit float.
199 /// This is an approximation, valid up to approximately -100dB of accuracy
200 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
201 // #BONUS
202 float _mm_pow_ss(float base, float exponent) pure @safe
203 {
204     __m128 r = _mm_pow_ps(_mm_set1_ps(base), _mm_set1_ps(exponent));
205     return r.array[0];
206 }
207 
208 /// Computes `base^exponent`, for 4 floats at once.
209 /// This is an approximation, valid up to approximately -100dB of accuracy
210 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
211 // #BONUS
212 __m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe
213 {
214     return _mm_exp_ps(exponents * _mm_log_ps(base));
215 }
216 
217 /// Computes `base^exponent`, for 4 floats at once.
218 /// This is an approximation, valid up to approximately -100dB of accuracy
219 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
220 // #BONUS
221 __m128 _mm_pow_ps(__m128 base, float exponent) pure @safe
222 {
223     return _mm_exp_ps(_mm_set1_ps(exponent) * _mm_log_ps(base));
224 }
225 
226 unittest
227 {
228     import std.math;
229 
230     bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow
231     {
232         if (!isFinite(groundTruth))
233             return true; // no need to approximate where this is NaN or infinite
234 
235         if (groundTruth == 0) // the approximaton should produce zero too if needed
236         {
237             return approx == 0;
238         }
239 
240         if (approx == 0)
241         {
242             // If the approximation produces zero, the error should be below 140 dB
243             return ( abs(groundTruth) < 1e-7 );
244         }
245 
246         if ( ( abs(groundTruth / approx) - 1 ) >= epsilon)
247         {
248             import core.stdc.stdio;
249             debug printf("approxEquals (%g, %g, %g) failed\n", groundTruth, approx, epsilon);
250             debug printf("ratio is %f\n", abs(groundTruth / approx) - 1);
251         }
252 
253         return ( abs(groundTruth / approx) - 1 ) < epsilon;
254     }
255 
256     // test _mm_log_ps
257     for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.05)
258     {
259         foreach (exponent; -23..23)
260         {
261             double x = mantissa * 2.0 ^^ exponent;
262             double phobosValue = log(x);
263             __m128 v = _mm_log_ps(_mm_set1_ps(x));
264             foreach(i; 0..4)
265                 assert(approxEquals(phobosValue, v.array[i], 1.1e-6));
266         }
267     }
268 
269     // test _mm_exp_ps    
270     for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1)
271     {
272         foreach (exponent; -23..23)
273         {
274             double x = mantissa * 2.0 ^^ exponent;
275 
276             // don't test too high numbers because they saturate FP precision pretty fast
277             if (x > 50) continue;
278 
279             double phobosValue = exp(x);
280             __m128 v = _mm_exp_ps(_mm_set1_ps(x));
281             foreach(i; 0..4)
282                assert(approxEquals(phobosValue, v.array[i], 3.4e-6));
283         }
284     }
285 
286     // test than exp(-inf) is 0
287     {
288         __m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity));
289         float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
290         assert(R.array == correct);
291     }
292 
293     // test log baheviour with NaN and infinities
294     // the only guarantee for now is that _mm_log_ps(negative) yield a NaN
295     {
296         __m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan));
297       // DOESN'T PASS
298       //  assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity
299       // DOESN'T PASS
300       //  assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity
301         assert(isNaN(R.array[2])); // log(negative number) = NaN
302 
303         // DOESN'T PASS
304         //assert(isNaN(R[3])); // log(NaN) = NaN
305     }
306 
307 
308     // test _mm_pow_ps
309     for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1)
310     {
311         foreach (exponent; -8..4)
312         {
313             double powExponent = mantissa * 2.0 ^^ exponent;
314 
315             for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.1)
316             {
317                 foreach (exponent2; -4..4)
318                 {
319                     double powBase = mantissa2 * 2.0 ^^ exponent2;
320                     double phobosValue = pow(powBase, powExponent);
321                     float fPhobos = phobosValue;
322                     if (!isFinite(fPhobos)) continue;
323                      __m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent));
324 
325                     foreach(i; 0..4)
326                     {
327                         if (!approxEquals(phobosValue, v.array[i], 1e-5))
328                         {
329                             printf("%g ^^ %g\n", powBase, powExponent);
330                             assert(false);
331                         }
332                     }
333                 }
334             }
335         }
336     }
337 }
338 
339 private:
340 
341 static immutable __m128 _ps_1   = [1.0f, 1.0f, 1.0f, 1.0f];
342 static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f];
343 static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f];