forked from fp64lib/fp64lib
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fp64_asinx.S
428 lines (367 loc) · 10.4 KB
/
fp64_asinx.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
/* Copyright (c) 2020 Uwe Bissinger
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of the copyright holders nor the names of
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE. */
/* $Id$ */
#include "fp64def.h"
#include "asmdef.h"
/* float64_t fp64_asin(float64_4 phi )
returns the arcus cos of phi (inverse function to fp64_sin(x)
Basic algorithm:
fp64_asin( x ) {
if( (x1 = fabs(x)) > 1.0 )
return NaN
sign = sign(x)
if( x1 <= 0.5 )
x2 = 2*x1*x1
factor = x1
else
x2 = 1 - x1
factor = sqrt(2*x2)
res = horner( x2, coeff_nominator )
x3 = horner( x2, coeff_denominator )
res = res / x3 * factor
if( x1 <= 0.5 )
res = res - PI/2
if( sign )
res = -res
return sign * res
*/
FUNCTION fp64_asin
; case | x | asin(x) | acos(x)
;------+-------+----------+--------
; 1a | NaN | NaN | NaN
; 1b | +/-Inf| NaN | NaN
; 2 | 0.0 | 0.0 | PI/2
; 3 | <-1.0 | NaN | NaN
; 4 | >+1.0 | NaN | NaN
; 5 | -1.0 | -PI/2 | PI
; 6 | +1.0 | +PI/2 | 0
; 7 | else | asin(x) | acos(x)
; cases 1a, 1b: NaN, +Inf, -Inf
; cases 3, 4: |x| > 1.0
.L_nan:
XJMP _U(__fp64_nan)
; case 2: 0 --> 0 (for asin) or PI/2 (for acos)
.L_zero:
sbrs r0, 1 ; acos?
XJMP _U(__fp64_szero) ; no: return 0.0 for asin
clt ; yes: return +PI/2 for acos
XJMP _U(__fp64_pi2)
/* float64_t fp64_acos(float64_4 phi )
returns the arcus cosine of phi (inverse function to fp64_cos(x)
*/
ENTRY fp64_acos
inc r1
inc r1 ; set bit 1 of r1 as flag for acos
/* float64_t fp64_asin(float64_4 phi )
returns the arcus sine of phi (inverse function to fp64_sin(x)
*/
ENTRY fp64_asin
sts __sinFlags, r1 ; either r1 = 0 (default) or r1 = 0x02 (acos)
clr r1
XCALL _U(__fp64_splitA)
lds r0, __sinFlags
bld r0, 7 ; save sign of x in __sinFlags
sts __sinFlags, r0
brcs .L_nan ; NaN or +/- INF
breq .L_zero ; x = 0 --> result of asin = 0
cpi rAE1, 0x03 ; fabs(x) < 1.0?
brmi 10f ; definitely yes, x < 2^-255 --> go ahead with approximation by poly(2*x^2)*x
brne .L_nan ; definitely no, x >= 2.0 --> cases 3,4, return NaN
; maybe yes, 2^-255 < x < 2.0
cpi rAE0, 0xfe
brlo 10f ; |x| < 0.5 --> go ahead with approximation by poly(2*x^2)*x
breq 20f ; 0.5 <= |x| < 1.0 --> --> go ahead with approximation by poly(1-x)*sqrt(2*x)
; now here: 1 <= |x| < 2
; check, whether |x| is == 1.0, i.e. significand = 0x80....0
XCALL _U(__fp64_cpc0A5)
brne .L_nan
cpi rA6, 0x80
brne .L_nan
; cases 5,6: fabs(x) = 1.0 --> return +/- PI/2 for asin, 0 or PI for acos
sbrs r0,1 ; acos?
XJMP _U(__fp64_pi2) ; no: return +/- PI/2
sbrs r0, 7 ; x == -1?
XJMP _U(__fp64_szero) ; no --> x == 1 --> return +0
clt
XCALL _U(__fp64_pi2) ; yes, return +PI/2
adiw rA6, 0x10 ; modify exponent to become 0x400
ret
; 0.5 < fabs(x) < 1.0
; approximate asin(x) with MiniMax(1-x)*sqrt(2*x)
20: inc r0 ; set bit 0 --> |x| > 0.5
10: ; fabs(x) < 0.5 approximate asin(x) with MiniMax
; bld r0, 7
sts __sinFlags, r0 ; save sign and function
clt
push rAx3 ; as all registers may be used, save them
push rAx2
push rAx1
push rAx0
push rBx11
push rBx10
push rBx9
push rBx8
push rBx7
push rBx6
push rBx5
push rBx4
push rBx3
push rBx2
push rBx1
push rBx0
push YH
push YL
push XH
push XL
XCALL _U(__fp64_movBAx) ; B = A = x1
lds r0, __sinFlags
sbrc r0, 0
rjmp 21f
11:
; x1 <= 0.5
; compute x2 = 2 * x1 * x1
; and put x1 = fabs(x) as factor on the stack
push rA7 ; save factor = x1 = fabs(x)
push rA6
push rA5
push rA4
push rA3
push rA2
push rA1
push rA0
push rAE1
push rAE0
XCALL _U(__fp64_mulsd3_pse)
adiw rAE0, 1 ; x2 = 2*x1*x1
rjmp 12f
; x1 > 0.5
; compute x2 = 1 - x
; and put sqrt(2*x2) as factor on the stack
21:
mov rA7, r1 ; A = 1.0
ldi rA6, 0x80
mov rA5, r1
mov rA4, r1
X_movw rA2, rA4
X_movw rA0, rA4
ldi rAE0, 0xff
ldi rAE1, 0x03
clt ; clear sign
XCALL _U(__fp64_sub_pse) ; x2 = 1 - x1
clt
push rA7 ; save x2 = 1 - x1
push rA6
push rA5
push rA4
push rA3
push rA2
push rA1
push rA0
push rAE1
push rAE0
adiw rAE0, 1 ; 2*x2
XCALL _U(__fp64_sqrt_pse) ; factor = sqrt(2*x2)
pop rBE0 ; retrieve x2
pop rBE1
pop rB0
pop rB1
pop rB2
pop rB3
pop rB4
pop rB5
pop rB6
pop rB7
;rcall __fp64_saveAB
push rA7 ; save factor = sqrt(2*x2)
push rA6
push rA5
push rA4
push rA3
push rA2
push rA1
push rA0
push rAE1
push rAE0
XCALL _U(__fp64_movABx) ; A = B = x2
12:
; start polynom calculation
; A: x2
; B: undefined
; stack: factor
; rcall __fp64_saveAB
push rA7 ; save x2
push rA6
push rA5
push rA4
push rA3
push rA2
push rA1
push rA0
push rAE1
push rAE0
ldi XL, lo8(.L_tableAsinDenom) ; calculate Denominator
ldi XH, hi8(.L_tableAsinDenom)
XCALL _U(__fp64_powser)
pop rBE0 ; retrieve x2
pop rBE1
pop rB0
pop rB1
pop rB2
pop rB3
pop rB4
pop rB5
pop rB6
pop rB7
push rA7 ; save x3 = horner( x2, coeff_nenner )
push rA6
push rA5
push rA4
push rA3
push rA2
push rA1
push rA0
push rAE1
push rAE0
XCALL _U(__fp64_movABx) ; A = B = x3
ldi XL, lo8(.L_tableAsinNom) ; calculate Nominator
ldi XH, hi8(.L_tableAsinNom)
XCALL _U(__fp64_powser) ; res = horner( x2, coeff_zähler )
pop rBE0 ; retrieve Denominator x3 = horner( x2, coeff_nenner )
pop rBE1
pop rB0
pop rB1
pop rB2
pop rB3
pop rB4
pop rB5
pop rB6
pop rB7
; rcall __fp64_saveAB
XCALL _U(__fp64_divsd3_pse) ; res = res / x3
pop rBE0 ; retrieve either x1 = fabs(x) or x1 = sqrt(2*fabs(x))
pop rBE1
pop rB0
pop rB1
pop rB2
pop rB3
pop rB4
pop rB5
pop rB6
pop rB7
XCALL _U(__fp64_mulsd3_pse) ; res = res / x3 * x1 or res = res / x3 * sqrt(2*fabs(x))
lds r0, __sinFlags ; retrieve sign(x)
sbrc r0, 1 ; acos?
rjmp 23f ; yes, handle acos
; no, handle asin
sbrs r0, 0 ; |x| > 0,5?
rjmp 19f ; no, result is already ok
; yes: adjust result to PI/2 - res or res - PI/2
XCALL _U(__fp64_ldb_pi2)
XCALL _U(__fp64_sub_pse) ; compute res - PI/2
lds r0, __sinFlags ; retrieve sign(x)
sbrs r0, 7 ; if( sign )
rjmp 19f
18:
bld rA7, 7 ; res = -res
subi rA7, 0x80
19:
pop XL ; restore all used registers
pop XH
pop YL
pop YH
pop rBx0
pop rBx1
pop rBx2
pop rBx3
pop rBx4
pop rBx5
pop rBx6
pop rBx7
pop rBx8
pop rBx9
pop rBx10
pop rBx11
pop rAx0
pop rAx1
pop rAx2
pop rAx3
lds r0, __sinFlags ; retrieve sign(x)
sbrs r0,1 ; asin?
bst r0, 7 ; yes, return res*sign
.L_retA:
XJMP _U(__fp64_rpretA) ; return res
23: ; handle acos --> r0 bit 1 set
XCALL _U(__fp64_ldb_pi2) ; B = PI/2
sbrc r0, 0 ; |x| <= 0.5?
adiw rBE0, 1 ; no: B = PI/2*2 = PI
; A: res
; B: PI/2 or PI (for |x|>0.5)
tst r0 ; x < 0 ?
brmi 24f
; x >= 0 --> either pi/2 - res or res
sbrc r0, 0 ; |x| <= 0,5?
rjmp 19b ; no, return res
rjmp 25f ; yes, return -res + pi/2
24: ; x < 0 --> either pi/2 + res or pi - res
sbrc r0, 0 ; |x| <= 0,5?
25:
subi rA7, 0x80 ; no, change sign of res --> -res + pi
bst rA7, 7
; rcall __fp64_saveAB
XCALL _U(__fp64_add_pse) ; compute res + pi/2 or -res + pi
rjmp 19b
.L_tableAsinNom:
; Ideal polynom:
; + 0.0000084705471128435769021718764878041684288 * x^5
; + 0.0012557428614630796315205218507940285622 * x^4
; - 0.04333483170641685705612351801 * x^3
; + 0.35290206232981519813422591897720574012 * x^2
; - 1.035234033892197627842731209 * x
; + 0.99999999999999999442491073135027586203
.byte 5 ; polynom power = 5 --> 6 entries
.byte 0x00, 0x8e, 0x1c, 0xb9, 0x0b, 0x50, 0x6c, 0xce, 0x03, 0xee ; 0.000008470547112843576902172332924136939494644
.byte 0x00, 0xa4, 0x97, 0xbd, 0x0b, 0x59, 0xc9, 0x25, 0x03, 0xf5 ; 0.001255742861463079621886854142509548637463
.byte 0x80, 0xb1, 0x7f, 0xdd, 0x4f, 0x4e, 0xbe, 0x1d, 0x03, 0xfa ; -0.04333483170641685630619655000828061020002
.byte 0x00, 0xb4, 0xaf, 0x94, 0x40, 0xcb, 0x86, 0x69, 0x03, 0xfd ; 0.35290206232981519813422591897720574012
.byte 0x80, 0x84, 0x82, 0x8c, 0x7f, 0xa2, 0xf6, 0x65, 0x03, 0xff ; -1.035234033892197619275421516249480191618
.byte 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0xff ; 1.0000000000000000000000000000000000000000
.byte 0x00 ; byte needed for code alignment to even adresses!
.L_tableAsinDenom:
; Ideal polynom:
; + 0.0028820878185134035637440105959294542908 *x^4
; - 0.06355588484963171659942148390 * x^3
; + 0.42736600959872448854098334016758333519 * x^2
; - 1.118567367225532923662371649 * x
; + 1
.byte 4 ; polynom power = 4 --> 5 entries
.byte 0x00, 0xbc, 0xe1, 0x68, 0xec, 0xba, 0x20, 0x29, 0x03, 0xf6 ; 0.002882087818513403567558667228709623486793
.byte 0x80, 0x82, 0x29, 0x96, 0x77, 0x2e, 0x19, 0xc7, 0x03, 0xfb ; -0.06355588484963171740094178829849624889903
.byte 0x00, 0xda, 0xcf, 0xb7, 0xb5, 0x4c, 0x0d, 0xee, 0x03, 0xfd ; 0.4273660095987244916804215222327911760658
.byte 0x80, 0x8f, 0x2d, 0x37, 0x2a, 0x4d, 0xa1, 0x57, 0x03, 0xff ; -1.118567367225532932506482097778643947095
.byte 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0xff ; 1.0000000000000000000000000000000000000000
.byte 0x00 ; byte needed for code alignment to even adresses!
ENDFUNC
.data
__sinFlags: .skip 1