1 /**
2 * Transcendental bonus functions.
3 *
4 * Copyright: Copyright Guillaumr Piolat 2016-2020.
5 *            Copyright (C) 2007  Julien Pommier
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.math;
9 
10 /* Copyright (C) 2007  Julien Pommier
11 
12   This software is provided 'as-is', without any express or implied
13   warranty.  In no event will the authors be held liable for any damages
14   arising from the use of this software.
15 
16   Permission is granted to anyone to use this software for any purpose,
17   including commercial applications, and to alter it and redistribute it
18   freely, subject to the following restrictions:
19 
20   1. The origin of this software must not be misrepresented; you must not
21      claim that you wrote the original software. If you use this software
22      in a product, an acknowledgment in the product documentation would be
23      appreciated but is not required.
24   2. Altered source versions must be plainly marked as such, and must not be
25      misrepresented as being the original software.
26   3. This notice may not be removed or altered from any source distribution.
27 
28   (this is the zlib license)
29 */
30 import inteli.emmintrin;
31 import inteli.internals;
32 
33 nothrow @nogc:
34 
35 /// Natural `log` computed for a single 32-bit float.
36 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
37 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
38 // #BONUS
39 float _mm_log_ss(float v) pure @safe
40 {
41     __m128 r = _mm_log_ps(_mm_set1_ps(v));
42     return r.array[0];
43 }
44 
45 /// Natural logarithm computed for 4 simultaneous float.
46 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
47 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
48 // #BONUS
49 __m128 _mm_log_ps(__m128 x) pure @safe
50 {
51     static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000];
52     static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524];
53     static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2];
54     static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1];
55     static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1];
56     static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1];
57     static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1];
58     static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1];
59     static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1];
60     static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1];
61     static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1];
62     static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
63     static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
64 
65     /* the smallest non denormalized float number */
66     static immutable __m128i _psi_min_norm_pos  = [0x00800000,   0x00800000,   0x00800000, 0x00800000];
67 
68     __m128i emm0;
69     __m128 one = _ps_1;
70     __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
71     x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos);  /* cut off denormalized stuff */
72     emm0 = _mm_srli_epi32(cast(__m128i)x, 23);
73 
74     /* keep only the fractional part */
75     x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask);
76     x = _mm_or_ps(x, _ps_0p5);
77 
78     emm0 = _mm_sub_epi32(emm0, _pi32_0x7f);
79     __m128 e = _mm_cvtepi32_ps(emm0);
80     e += one;
81     __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF);
82     __m128 tmp = _mm_and_ps(x, mask);
83     x -= one;
84     e -= _mm_and_ps(one, mask);
85     x += tmp;
86     __m128 z = x * x;
87     __m128 y = _ps_cephes_log_p0;
88     y *= x;
89     y += _ps_cephes_log_p1;
90     y *= x;
91     y += _ps_cephes_log_p2;
92     y *= x;
93     y += _ps_cephes_log_p3;
94     y *= x;
95     y += _ps_cephes_log_p4;
96     y *= x;
97     y += _ps_cephes_log_p5;
98     y *= x;
99     y += _ps_cephes_log_p6;
100     y *= x;
101     y += _ps_cephes_log_p7;
102     y *= x;
103     y += _ps_cephes_log_p8;
104     y *= x;
105 
106     y = y * z;
107     tmp = e * _ps_cephes_log_q1;
108     y += tmp;
109     tmp = z * _ps_0p5;
110     y = y - tmp;
111     tmp = e * _ps_cephes_log_q2;
112     x += y;
113     x += tmp;
114     x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
115     return x;
116 }
117 
118 /// Natural `exp` computed for a single float.
119 /// This is an approximation, valid up to approximately -109dB of accuracy
120 /// IMPORTANT: NaN input not supported.
121 // #BONUS
122 float _mm_exp_ss(float v) pure @safe
123 {
124     __m128 r = _mm_exp_ps(_mm_set1_ps(v));
125     return r.array[0];
126 }
127 
128 /// Natural `exp` computed for 4 simultaneous float in `x`.
129 /// This is an approximation, valid up to approximately -109dB of accuracy
130 /// IMPORTANT: NaN input not supported.
131 // #BONUS
132 __m128 _mm_exp_ps(__m128 x) pure @safe
133 {
134     static immutable __m128 _ps_exp_hi         = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f];
135     static immutable __m128 _ps_exp_lo         = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f];
136     static immutable __m128 _ps_cephes_LOG2EF  = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341];
137     static immutable __m128 _ps_cephes_exp_C1  = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
138     static immutable __m128 _ps_cephes_exp_C2  = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
139     static immutable __m128 _ps_cephes_exp_p0  = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4];
140     static immutable __m128 _ps_cephes_exp_p1  = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3];
141     static immutable __m128 _ps_cephes_exp_p2  = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3];
142     static immutable __m128 _ps_cephes_exp_p3  = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2];
143     static immutable __m128 _ps_cephes_exp_p4  = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1];
144     static immutable __m128 _ps_cephes_exp_p5  = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1];
145 
146     __m128 tmp = _mm_setzero_ps(), fx;
147     __m128i emm0;
148     __m128 one = _ps_1;
149 
150     x = _mm_min_ps(x, _ps_exp_hi);
151     x = _mm_max_ps(x, _ps_exp_lo);
152 
153     /* express exp(x) as exp(g + n*log(2)) */
154     fx = x * _ps_cephes_LOG2EF;
155     fx += _ps_0p5;
156 
157     /* how to perform a floorf with SSE: just below */
158     emm0 = _mm_cvttps_epi32(fx);
159     tmp  = _mm_cvtepi32_ps(emm0);
160 
161     /* if greater, substract 1 */
162     __m128 mask = _mm_cmpgt_ps(tmp, fx);
163     mask = _mm_and_ps(mask, one);
164     fx = tmp - mask;
165 
166     tmp = fx * _ps_cephes_exp_C1;
167     __m128 z = fx * _ps_cephes_exp_C2;
168     x -= tmp;
169     x -= z;
170 
171     z = x * x;
172 
173     __m128 y = _ps_cephes_exp_p0;
174     y *= x;
175     y += _ps_cephes_exp_p1;
176     y *= x;
177     y += _ps_cephes_exp_p2;
178     y *= x;
179     y += _ps_cephes_exp_p3;
180     y *= x;
181     y += _ps_cephes_exp_p4;
182     y *= x;
183     y += _ps_cephes_exp_p5;
184     y *= z;
185     y += x;
186     y += one;
187 
188     /* build 2^n */
189     emm0 = _mm_cvttps_epi32(fx);
190 
191     emm0 = _mm_add_epi32(emm0, _pi32_0x7f);
192     emm0 = _mm_slli_epi32(emm0, 23);
193     __m128 pow2n = cast(__m128)emm0;
194     y *= pow2n;
195     return y;
196 }
197 
198 /// Computes `base^exponent` for a single 32-bit float.
199 /// This is an approximation, valid up to approximately -100dB of accuracy
200 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
201 // #BONUS
202 float _mm_pow_ss(float base, float exponent) pure @safe
203 {
204     __m128 r = _mm_pow_ps(_mm_set1_ps(base), _mm_set1_ps(exponent));
205     return r.array[0];
206 }
207 
208 /// Computes `base^exponent`, for 4 floats at once.
209 /// This is an approximation, valid up to approximately -100dB of accuracy
210 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
211 // #BONUS
212 __m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe
213 {
214     return _mm_exp_ps(exponents * _mm_log_ps(base));
215 }
216 
217 /// Computes `base^exponent`, for 4 floats at once.
218 /// This is an approximation, valid up to approximately -100dB of accuracy
219 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
220 // #BONUS
221 __m128 _mm_pow_ps(__m128 base, float exponent) pure @safe
222 {
223     return _mm_exp_ps(_mm_set1_ps(exponent) * _mm_log_ps(base));
224 }
225 
226 unittest
227 {
228     import std.math;
229 
230     bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow
231     {
232         if (!isFinite(groundTruth))
233             return true; // no need to approximate where this is NaN or infinite
234 
235         if (groundTruth == 0) // the approximaton should produce zero too if needed
236         {
237             return approx == 0;
238         }
239 
240         if (approx == 0)
241         {
242             // If the approximation produces zero, the error should be below 140 dB
243             return ( abs(groundTruth) < 1e-7 );
244         }
245 
246         if ( ( abs(groundTruth / approx) - 1 ) >= epsilon)
247         {
248             assert(false);
249         }
250 
251         return ( abs(groundTruth / approx) - 1 ) < epsilon;
252     }
253 
254     // test _mm_log_ps
255     for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.05)
256     {
257         foreach (exponent; -23..23)
258         {
259             double x = mantissa * 2.0 ^^ exponent;
260             double phobosValue = log(x);
261             __m128 v = _mm_log_ps(_mm_set1_ps(x));
262             foreach(i; 0..4)
263                 assert(approxEquals(phobosValue, v.array[i], 1.1e-6));
264         }
265     }
266 
267     // test _mm_exp_ps    
268     for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1)
269     {
270         foreach (exponent; -23..23)
271         {
272             double x = mantissa * 2.0 ^^ exponent;
273 
274             // don't test too high numbers because they saturate FP precision pretty fast
275             if (x > 50) continue;
276 
277             double phobosValue = exp(x);
278             __m128 v = _mm_exp_ps(_mm_set1_ps(x));
279             foreach(i; 0..4)
280             {
281                 if (!approxEquals(phobosValue, v.array[i], 3.4e-6))
282                 {
283                     assert(false);
284                 }
285             }
286         }
287     }
288 
289     // test than exp(-inf) is 0
290     {
291         __m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity));
292         float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
293         assert(R.array == correct);
294     }
295 
296     // test log baheviour with NaN and infinities
297     // the only guarantee for now is that _mm_log_ps(negative) yield a NaN
298     {
299         __m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan));
300       // DOESN'T PASS
301       //  assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity
302       // DOESN'T PASS
303       //  assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity
304         assert(isNaN(R.array[2])); // log(negative number) = NaN
305 
306         // DOESN'T PASS
307         //assert(isNaN(R[3])); // log(NaN) = NaN
308     }
309 
310 
311     // test _mm_pow_ps
312     for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1)
313     {
314         foreach (exponent; -8..4)
315         {
316             double powExponent = mantissa * 2.0 ^^ exponent;
317 
318             for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.1)
319             {
320                 foreach (exponent2; -4..4)
321                 {
322                     double powBase = mantissa2 * 2.0 ^^ exponent2;
323                     double phobosValue = pow(powBase, powExponent);
324                     float fPhobos = phobosValue;
325                     if (!isFinite(fPhobos)) continue;
326                      __m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent));
327 
328                     foreach(i; 0..4)
329                     {
330                         if (!approxEquals(phobosValue, v.array[i], 1e-5))
331                         {
332                             printf("%g ^^ %g\n", powBase, powExponent);
333                             assert(false);
334                         }
335                     }
336                 }
337             }
338         }
339     }
340 }
341 
342 private:
343 
344 static immutable __m128 _ps_1   = [1.0f, 1.0f, 1.0f, 1.0f];
345 static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f];
346 static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f];