1 /**
2 * Transcendental function on 4 numbers at once.
3 *
4 * Copyright: Copyright Auburn Sounds 2016-2018.
5 *            Copyright (C) 2007  Julien Pommier
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 * Authors:   Guillaume Piolat
8 */
9 module inteli.math;
10 
11 /* Copyright (C) 2007  Julien Pommier
12 
13   This software is provided 'as-is', without any express or implied
14   warranty.  In no event will the authors be held liable for any damages
15   arising from the use of this software.
16 
17   Permission is granted to anyone to use this software for any purpose,
18   including commercial applications, and to alter it and redistribute it
19   freely, subject to the following restrictions:
20 
21   1. The origin of this software must not be misrepresented; you must not
22      claim that you wrote the original software. If you use this software
23      in a product, an acknowledgment in the product documentation would be
24      appreciated but is not required.
25   2. Altered source versions must be plainly marked as such, and must not be
26      misrepresented as being the original software.
27   3. This notice may not be removed or altered from any source distribution.
28 
29   (this is the zlib license)
30 */
31 import inteli.emmintrin;
32 import inteli.internals;
33 
34 static immutable __m128 _ps_1   = [1.0f, 1.0f, 1.0f, 1.0f];
35 static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f];
36 
37 /* the smallest non denormalized float number */
38 static immutable __m128i _psi_min_norm_pos  = [0x00800000,   0x00800000,   0x00800000, 0x00800000];
39 
40 static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f];
41 
42 
43 nothrow @nogc:
44 
45 /// Natural `log` computed for a single 32-bit float.
46 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
47 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
48 // #BONUS
49 float _mm_log_ss(float v) pure @safe
50 {
51     __m128 r = _mm_log_ps(_mm_set1_ps(v));
52     return r[0];
53 }
54 
55 /// Natural logarithm computed for 4 simultaneous float.
56 /// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
57 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
58 // #BONUS
59 __m128 _mm_log_ps(__m128 x) pure @safe
60 {
61     static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000];
62     static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524];
63     static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2];
64     static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1];
65     static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1];
66     static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1];
67     static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1];
68     static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1];
69     static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1];
70     static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1];
71     static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1];
72     static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
73     static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
74 
75     __m128i emm0;
76     __m128 one = _ps_1;
77     __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
78     x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos);  /* cut off denormalized stuff */
79     emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
80 
81     /* keep only the fractional part */
82     x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask);
83     x = _mm_or_ps(x, _ps_0p5);
84 
85     emm0 = _mm_sub_epi32(emm0, cast(__m128i)_pi32_0x7f);
86     __m128 e = _mm_cvtepi32_ps(emm0);
87     e += one;
88     __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF);
89     __m128 tmp = _mm_and_ps(x, mask);
90     x -= one;
91     e -= _mm_and_ps(one, mask);
92     x += tmp;
93     __m128 z = x * x;
94     __m128 y = _ps_cephes_log_p0;
95     y *= x;
96     y += _ps_cephes_log_p1;
97     y *= x;
98     y += _ps_cephes_log_p2;
99     y *= x;
100     y += _ps_cephes_log_p3;
101     y *= x;
102     y += _ps_cephes_log_p4;
103     y *= x;
104     y += _ps_cephes_log_p5;
105     y *= x;
106     y += _ps_cephes_log_p6;
107     y *= x;
108     y += _ps_cephes_log_p7;
109     y *= x;
110     y += _ps_cephes_log_p8;
111     y *= x;
112     y *= z;
113     tmp = e * _ps_cephes_log_q1;
114     y += tmp;
115     tmp = z * _ps_0p5;
116     y = y - tmp;
117     tmp = e * _ps_cephes_log_q2;
118     x += y;
119     x += tmp;
120     x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
121     return x;
122 }
123 
124 /// Natural `exp` computed for a single float.
125 /// This is an approximation, valid up to approximately -109dB of accuracy
126 /// IMPORTANT: NaN input not supported.
127 // #BONUS
128 float _mm_exp_ss(float v) pure @safe
129 {
130     __m128 r = _mm_exp_ps(_mm_set1_ps(v));
131     return r[0];
132 }
133 
134 /// Natural `exp` computed for 4 simultaneous float in `x`.
135 /// This is an approximation, valid up to approximately -109dB of accuracy
136 /// IMPORTANT: NaN input not supported.
137 // #BONUS
138 __m128 _mm_exp_ps(__m128 x) pure @safe
139 {
140     static immutable __m128 _ps_exp_hi         = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f];
141     static immutable __m128 _ps_exp_lo         = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f];
142     static immutable __m128 _ps_cephes_LOG2EF  = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341];
143     static immutable __m128 _ps_cephes_exp_C1  = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
144     static immutable __m128 _ps_cephes_exp_C2  = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
145     static immutable __m128 _ps_cephes_exp_p0  = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4];
146     static immutable __m128 _ps_cephes_exp_p1  = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3];
147     static immutable __m128 _ps_cephes_exp_p2  = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3];
148     static immutable __m128 _ps_cephes_exp_p3  = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2];
149     static immutable __m128 _ps_cephes_exp_p4  = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1];
150     static immutable __m128 _ps_cephes_exp_p5  = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1];
151 
152     __m128 tmp = _mm_setzero_ps(), fx;
153     __m128i emm0;
154     __m128 one = _ps_1;
155 
156     x = _mm_min_ps(x, _ps_exp_hi);
157     x = _mm_max_ps(x, _ps_exp_lo);
158 
159     /* express exp(x) as exp(g + n*log(2)) */
160     fx = x * _ps_cephes_LOG2EF;
161     fx += _ps_0p5;
162 
163     /* how to perform a floorf with SSE: just below */
164     emm0 = _mm_cvttps_epi32(fx);
165     tmp  = _mm_cvtepi32_ps(emm0);
166 
167     /* if greater, substract 1 */
168     __m128 mask = _mm_cmpgt_ps(tmp, fx);    
169     mask = _mm_and_ps(mask, one);
170     fx = tmp - mask;
171 
172     tmp = fx * _ps_cephes_exp_C1;
173     __m128 z = fx * _ps_cephes_exp_C2;
174     x -= tmp;
175     x -= z;
176 
177     z = x * x;
178 
179     __m128 y = _ps_cephes_exp_p0;
180     y *= x;
181     y += _ps_cephes_exp_p1;
182     y *= x;
183     y += _ps_cephes_exp_p2;
184     y *= x;
185     y += _ps_cephes_exp_p3;
186     y *= x;
187     y += _ps_cephes_exp_p4;
188     y *= x;
189     y += _ps_cephes_exp_p5;
190     y *= z;
191     y += x;
192     y += one;
193 
194     /* build 2^n */
195     emm0 = _mm_cvttps_epi32(fx);
196     emm0 = _mm_add_epi32(emm0, cast(__m128i)_pi32_0x7f);
197     emm0 = _mm_slli_epi32(emm0, 23);
198     __m128 pow2n = _mm_castsi128_ps(emm0);
199     y = _mm_mul_ps(y, pow2n);
200     return y;
201 }
202 
203 /// Computes `base^exponent` for a single 32-bit float.
204 /// This is an approximation, valid up to approximately -100dB of accuracy
205 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
206 // #BONUS
207 float _mm_pow_ss(float base, float exponent) pure @safe
208 {
209     __m128 r = _mm_pow_ps(_mm_set1_ps(base), _mm_set1_ps(exponent));
210     return r[0];
211 }
212 
213 /// Computes `base^exponent`, for 4 floats at once.
214 /// This is an approximation, valid up to approximately -100dB of accuracy
215 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
216 // #BONUS
217 __m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe
218 {
219     return _mm_exp_ps(exponents * _mm_log_ps(base));
220 }
221 
222 /// Computes `base^exponent`, for 4 floats at once.
223 /// This is an approximation, valid up to approximately -100dB of accuracy
224 /// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
225 // #BONUS
226 __m128 _mm_pow_ps(__m128 base, float exponent) pure @safe
227 {
228     return _mm_exp_ps(_mm_set1_ps(exponent) * _mm_log_ps(base));
229 }
230 
231 unittest
232 {
233     import std.math;
234 
235     bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow
236     {
237         if (!isFinite(groundTruth))
238             return true; // no need to approximate where this is NaN or infinite
239 
240         if (groundTruth == 0) // the approximaton should produce zero too if needed
241         {
242             return approx == 0;
243         }
244 
245         if (approx == 0)
246         {
247             // If the approximation produces zero, the error should be below 140 dB
248             return ( abs(groundTruth) < 1e-7 );
249         }
250 
251         if ( ( abs(groundTruth / approx) - 1 ) >= epsilon)
252         {
253             import core.stdc.stdio;
254             debug printf("approxEquals (%g, %g, %g) failed\n", groundTruth, approx, epsilon);
255             debug printf("ratio is %f\n", abs(groundTruth / approx) - 1);
256         }
257 
258         return ( abs(groundTruth / approx) - 1 ) < epsilon;
259     }
260 
261     // test _mm_log_ps
262     for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.005)
263     {
264         foreach (exponent; -23..23)
265         {
266             double x = mantissa * 2.0 ^^ exponent;
267             double phobosValue = log(x);
268             __m128 v = _mm_log_ps(_mm_set1_ps(x));
269             foreach(i; 0..4)
270                 assert(approxEquals(phobosValue, v[i], 1.1e-6));
271         }
272     }
273 
274     // test _mm_exp_ps    
275     for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.01)
276     {
277         foreach (exponent; -23..23)
278         {
279             double x = mantissa * 2.0 ^^ exponent;
280 
281             // don't test too high numbers because they saturate FP precision pretty fast
282             if (x > 50) continue;
283 
284             double phobosValue = exp(x);
285             __m128 v = _mm_exp_ps(_mm_set1_ps(x));
286             foreach(i; 0..4)
287                assert(approxEquals(phobosValue, v[i], 3.4e-6));
288         }
289     }
290 
291     // test than exp(-inf) is 0
292     {
293         __m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity));
294         float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
295         assert(R.array == correct);
296     }
297 
298     // test log baheviour with NaN and infinities
299     // the only guarantee for now is that _mm_log_ps(negative) yield a NaN
300     {
301         __m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan));
302       // DOESN'T PASS
303       //  assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity
304       // DOESN'T PASS
305       //  assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity
306         assert(isNaN(R[2])); // log(negative number) = NaN
307 
308         // DOESN'T PASS
309         //assert(isNaN(R[3])); // log(NaN) = NaN
310     }
311 
312 
313     // test _mm_pow_ps
314     for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.01)
315     {
316         foreach (exponent; -8..4)
317         {
318             double powExponent = mantissa * 2.0 ^^ exponent;
319 
320             for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.01)
321             {
322                 foreach (exponent2; -4..4)
323                 {
324                     double powBase = mantissa2 * 2.0 ^^ exponent2;
325                     double phobosValue = pow(powBase, powExponent);
326                     float fPhobos = phobosValue;
327                     if (!isFinite(fPhobos)) continue;
328                      __m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent));
329 
330                     foreach(i; 0..4)
331                     {
332                         if (!approxEquals(phobosValue, v[i], 1e-5))
333                         {
334                             printf("%g ^^ %g\n", powBase, powExponent);
335                             assert(false);
336                         }
337                     }
338                 }
339             }
340         }
341     }
342 }