; Copyright 2004 Castle Technology Ltd
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
        GET objmacs.s

        GBLL    FloatingPointArgsInRegs
FloatingPointArgsInRegs SETL {FALSE}
        [ FloatingPointArgsInRegs
        ! 0, "WARNING: Floating point arguments ARE being passed in FP registers"
        ]

        CodeArea

;        EXPORT  atan2
        EXPORT  atan2f
        EXPORT  fma
        EXPORT  fmaf
        EXPORT  log1p
        EXPORT  log1pf

        IMPORT  _ll_muluu

        ^       0,sp
Xsue    #       4
Xmhi    #       4
Xmlo    #       4
Ysue    #       4
Ymhi    #       4
Ymlo    #       4
Zsue    #       4
Zmhi    #       4
Zm2     #       4
Zm3     #       4
Zm4     #       4
fma_framesize * :INDEX:@

; An "extended extended" number with extra bits - 128 bits
; of mantissa. Exponent range same as normal exponent.
        ^       0,sp
XYsue   #       4
XYmhi   #       4
XYmmh   #       4
XYmml   #       4
XYmlo   #       4

; In: a1->128-bit mantissa, a2 = amount to denormalise by.
; Out: a1-a4 = denormalised mantissa (bit 127 = sticky)
m128_denormalise
        CMP     a2,#128
        BHS     m128_flush
        FunctionEntry ,"v1"
        MOV     ip,a2
        LDMIA   a1,{a1-a4}
        CMP     ip,#32
        BLO     %FT10
05      TEQ     a4,#0
        MOV     a4,a3
        ORRNE   a4,a4,#1
        MOV     a3,a2
        MOV     a2,a1
        MOV     a1,#0
        SUB     ip,ip,#32
        CMP     ip,#32
        BHS     %BT05
10      RSB     v1,ip,#32
        MOVS    lr,a4,LSL v1
        MOV     a4,a4,LSR ip
        ORRNE   a4,a4,#1
        ORR     a4,a4,a3,LSL v1
        MOV     a3,a3,LSR ip
        ORR     a3,a3,a2,LSL v1
        MOV     a2,a2,LSR ip
        ORR     a2,a2,a1,LSL v1
        MOV     a1,a1,LSR ip
        Return  ,"v1"

m128_flush
        LDMIA   a1,{a1-a4}
        ORRS    ip,a1,a2
        ORRS    ip,ip,a3
        ORRS    ip,ip,a4
        MOV     a1,#0
        MOV     a2,#0
        MOV     a3,#0
        MOVEQ   a4,#0
        MOVNE   a4,#1
        Return  ,,LinkNotStacked

; fma (fused multiply-add) implementation
fma
        [ :LNOT:FloatingPointArgsInRegs
        STMFD   sp!,{a1-a4}
        LDFD    f0,[sp],#8
        LDFD    f1,[sp],#8
        LDFD    f2,[sp,#0]
        ]
        FunctionEntry ,"v1-v6"
        ; Convert to extended precision, because they're easier to
        ; work with, and it eliminates subnormals.
        ; 64x64->128+64->53 is just as easy as 53x53->106+53->53
        ASSERT  fma_framesize = 3*12+8
        STFE    f2,[sp,#-12-8]!
        STFE    f1,[sp,#-12]!
        STFE    f0,[sp,#-12]!

        LDR     a1,Xsue
        LDR     a2,Ysue

        BIC     a3,a1,#&80000000        ; a1,a2 = exponents
        BIC     a4,a2,#&80000000

        ; Check for NaNs/infinities (exponent > &6000 will suffice
        ; because we know operands are only doubles, so max normal
        ; exponent is &43FF)
        CMP     a3,#&6000
        CMPLO   a4,#&6000
        BHS     fma_naninfmult

        ; If either operand has zero exponent, it's a zero, so
        ; zero result
        TEQ     a3,#0
        TEQNE   a4,#0
        BEQ     fma_zeromult

        ADD     v5,a3,a4
        SUB     v5,v5,#&3F00            ; v5 = prospective exponent
        SUB     v5,v5,#&00FE            ; (exponent range &379C-&47FE)
        EOR     v6,a1,a2                ; v6 = result sign
        AND     v6,v6,#&80000000

        ; Multiply the two mantissas
        ; Assemble the result in {v1,v2,v3,v4} (v1 high)
        ; Just break it down into 32x32->64 multiplies

        LDR     a1,Xmlo
        LDR     a2,Ymlo
        BL      _ll_muluu
        MOV     v4,a1
        MOV     v3,a2

        LDR     a1,Xmhi
        LDR     a2,Ymhi
        BL      _ll_muluu
        MOV     v2,a1
        MOV     v1,a2

        LDR     a1,Xmlo
        LDR     a2,Ymhi
        BL      _ll_muluu
        ADDS    v3,v3,a1
        ADCS    v2,v2,a2
        ADC     v1,v1,#0

        LDR     a1,Xmhi
        LDR     a2,Ymlo
        BL      _ll_muluu
        ADDS    v3,v3,a1
        ADCS    v2,v2,a2
        ADCS    v1,v1,#0

        ; May need to normalise, but by at most 1 bit
        ; given that operands were normalised.
        BMI     %FT01

        ADDS    v4,v4,v4
        ADCS    v3,v3,v3
        ADCS    v2,v2,v2
        ADC     v1,v1,v1
        SUB     v5,v5,#1
01
        ; Write x*y into XY
        ORR     a4,v5,v6
        ASSERT  :INDEX:XYsue = 0
        STMIA   sp,{a4,v1-v4}

        LDR     a1,Zsue
        BIC     a2,a1,#&80000000

        ; Check for z being inf/NaN. Again, exp>=&6000 is fine
        CMP     a2,#&6000
        BHS     fma_naninfadd

        ; Extend Z to 128 bits
        MOV     lr,#0
        STR     lr,Zm3
        STR     lr,Zm4

        ; Normal addition now of two extended extended numbers
        ; Mantissas are 128-bit, but we only have 106 significant
        ; bits. Use the bottom bit as a sticky bit in forthcoming
        ; normalisations, the other excess as guard/round bits.

        EOR     v4,a1,v6
        SUBS    a3,v5,a2
        ; a1 = z sign/exponent
        ; a2 = z exponent
        ; a3 = exponent difference (x*y) compared to z; flags set on this
        ; v4 = sign difference
        ; v5 = (x*y) exponent
        ; v6 = (x*y) sign
        ; XY and Z are 160-bit numbers
        BHI     fma_op2shift
        BEQ     fma_shiftdone
fma_op1shift
        MOV     v5, a2
        ADR     a1,XYmhi
        RSB     a2,a3,#0
        BL      m128_denormalise
        ASSERT  :INDEX:XYmhi=4
        STMIB   sp,{a1-a4}
        B       fma_shiftdone

fma_op2shift
        ADR     a1,Zmhi
        MOV     a2,a3
        BL      m128_denormalise
        ADR     lr,Zmhi
        STMIA   lr,{a1-a4}
        B       fma_shiftdone

; Now v4 = sign difference
;     v5 = prospective exponent
;     v6 = operand 1 sign
fma_shiftdone
        TEQ     v4,#0
        ASSERT  :INDEX:XYmhi=4
        LDMIB   sp,{a1-a4}
        ADR     lr,Zmhi
        LDMIA   lr,{v1-v4}
        BMI     fma_difference

fma_sum
        ADDS    a4,a4,v4
        ADCS    a3,a3,v3
        ADCS    a2,a2,v2
        ADCS    a1,a1,v1
        BCC     fma_adddone
        ADD     v5,v5,#1
        MOVS    a1,a1,RRX
        MOVS    a2,a2,RRX
        MOVS    a3,a3,RRX
        MOVS    a4,a4,RRX
        ORRCS   a4,a4,#1
        B       fma_adddone

fma_difference
        SUBS    a4,a4,v4
        SBCS    a3,a3,v3
        SBCS    a2,a2,v2
        SBCS    a1,a1,v1
        BCS     fma_diffnorm
        ; Subtraction came out negative
        EOR     v6,v6,#&80000000
        RSBS    a4,a4,#0
        RSCS    a3,a3,#0
        RSCS    a2,a2,#0
        RSCS    a1,a1,#0
fma_diffnorm
        ; Maximum normalisation is 106-odd bits. N flag indicates
        ; if we're already normalised.
        BMI     fma_adddone
        ; Try 1 bit to start with
        ADDS    a4,a4,a4
        ADCS    a3,a3,a3
        ADCS    a2,a2,a2
        ADCS    a1,a1,a1
        SUB     v5,v5,#1
        BMI     fma_adddone
        ; Okay, still not normalised. We can infer that the
        ; exponent difference was 0 or 1, so the round/sticky bits are 0.
        ; Check for an exact zero result first.
        ORR     lr,a1,a2
        ORR     lr,lr,a3
        ORRS    lr,lr,a4
        BEQ     fma_zerosub

20      TEQ     a1,#0
        MOVEQ   a1,a2
        MOVEQ   a2,a3
        MOVEQ   a3,a4
        MOVEQ   a4,#0
        SUBEQ   v5,v5,#32
        BEQ     %BT20

        MOV     lr,#0
        MOVS    ip,a1,LSR #16
        MOVEQ   a1,a1,LSL #16
        ADDEQ   lr,lr,#16
        MOVS    ip,a1,LSR #24
        MOVEQ   a1,a1,LSL #8
        ADDEQ   lr,lr,#8
        MOVS    ip,a1,LSR #28
        MOVEQ   a1,a1,LSL #4
        ADDEQ   lr,lr,#4
        MOVS    ip,a1,LSR #30
        MOVEQ   a1,a1,LSL #2
        ADDEQ   lr,lr,#2
        MOVS    ip,a1,LSR #31
        MOVEQ   a1,a1,LSL #1
        ADDEQ   lr,lr,#1

        RSBS    ip,lr,#32
        ORR     a1,a1,a2,LSR ip
        MOV     a2,a2,LSL lr
        ORR     a2,a2,a3,LSR ip
        MOV     a3,a3,LSL lr
        ORR     a3,a3,a4,LSR ip
        MOV     a4,a4,LSL lr
        SUB     v5,v5,lr

fma_adddone

; We now have an answer. v5 is our exponent; a1-a4 are 128
; bits of mantissa (bottom bit sticky). v6 is our sign.
; First, transfer the sticky bit to bit 63.

        ORRS    a3,a3,a4
        ORRNE   a2,a2,#1

fma_return
; Now just pack it back into an extended number, and convert
; to double. Hey presto.

        ORR     lr,v5,v6
        STR     lr,[sp,#0]
        STMIB   sp,{a1,a2}
        LDFE    f0,[sp],#fma_framesize
        MVFD    f0,f0
        Return  ,"v1-v6"

; Subtracted equal quantities. Result is +0 (rounding to nearest).
fma_zerosub
        MOV     v5,#0
        MOV     v6,#0
        B       fma_return

; One of x and y is an infinity or NaN. First check if z is a NaN;
; if it is then copy it into x to make sure MUF doesn't raise an
; invalid for 0*inf. [C99 says fma(0,inf,nan) is allowed to raise
; invalid, but the current IEEE 754R draft says it shouldn't.]
; Then fall through to the code that handles fma(0,finite,z).
fma_naninfmult
        CMF     f2,#0
        MVFVSD  f0,f2

; One of x and y is zero, the other is finite. The result is zero.
; We can thus just calculate x*y+z in normal arithmetic, to get
; all the appropriate exceptions out.
fma_zeromult
        MUFD    f0,f0,f1
        ADFD    f0,f0,f2
        ADD     sp,sp,#fma_framesize
        Return  ,"v1-v6"

; z is an infinity or NaN. We know x*y is finite and non-zero,
; so the result is z. Just return it.
fma_naninfadd
        MVFD    f0,f2
        ADD     sp,sp,#fma_framesize
        Return  ,"v1-v6"

; fmaf implementation is MUCH simpler. Note that this DOES
; raise "invalid" for fmaf(0,INFINITY,NAN), unlike fma. To avoid
; this would seriously affect its performance and prevent inlining,
; and we're claiming FP_FAST_FMAF.
fmaf
        [ :LNOT:FloatingPointArgsInRegs
        STMFD   sp!, {r0-r3}
        LDFD    f0, [sp], #8
        LDFD    f1, [sp], #8
        LDFD    f2, [sp, #0]
        ]
        MVFS    f0, f0
        MVFS    f1, f1
        MVFS    f2, f2
        FMLE    f0, f0, f1              ; IVO possible
        ADFS    f0, f0, f2              ; UFL, OFL, INX possible
        Return  ,,LinkNotStacked

; This implementation based on Goldberg [1991], who showed that
; log1p(x) = (x * log(1+x)) / ((1+x) - 1) has error < 5*epsilon
; for 0 <= x < 0.75 if log is accurate. That should mean this is
; more than adequate to get a good double result in 0 <= x < 0.5.
; Experimentally, it does appear good for -0.5 < x < 0.5.
; For |x| >= 0.5 we just calculate log(1+x) in extended precision,
; as we won't be losing accuracy in the addition.
log1p
        [ :LNOT: FloatingPointArgsInRegs
        STMFD   sp!, {r0, r1}
        LDFD    f0, [sp], #8
        ]
        CNF     f0, #0.5
        BLE     log1p_bigneg
        ADFE    f1, f0, #1              ; if (1+x) == 1, return x
        CMF     f1, #1                  ; gets INX right
        Return  ,,LinkNotStacked,EQ
        CMF     f0, #0.5
        BPL     log1p_bigpos
        LGNE    f2, f1
        MUFE    f2, f2, f0
        SUFE    f3, f1, #1
        DVFD    f0, f2, f3
        Return  ,,LinkNotStacked

log1p_bigneg
        CNF     f0, #1                  ; To avoid inexact
        ADFGEE  f0, f0, #1
        LGND    f0, f0
        Return  ,,LinkNotStacked

log1p_bigpos
        LGND    f0, f1
        Return  ,,LinkNotStacked

log1pf
        [ :LNOT: FloatingPointArgsInRegs
        STMFD   sp!, {r0, r1}
        LDFD    f0, [sp], #8
        ]
        MVFS    f0, f0
        CNF     f0, #0.5
        BLE     log1pf_bigneg
        ADFE    f1, f0, #1              ; if (1+x) == 1, return x
        CMF     f1, #1                  ; gets INX right
        Return  ,,LinkNotStacked,EQ
        CMF     f0, #0.5
        BPL     log1pf_bigpos
        LGNE    f2, f1
        MUFE    f2, f2, f0
        SUFE    f3, f1, #1
        DVFS    f0, f2, f3
        Return  ,,LinkNotStacked

log1pf_bigneg
        CNF     f0, #1                  ; To avoid inexact
        ADFGEE  f0, f0, #1
        LGND    f0, f0
        Return  ,,LinkNotStacked

log1pf_bigpos
        LGNS    f0, f1
        Return  ,,LinkNotStacked

        MACRO
$name   Atan2 $p
        Function $name, leaf
        [ :LNOT:FloatingPointArgsInRegs
        STMFD   sp!, {r0-r3}
        LDFD    f0, [sp], #8
        LDFD    f1, [sp], #8
        ]
        [ "$p" = "S"
        MVFS    f0, f0
        MVFS    f1, f1
        ]
        ; Try to deal with x != �y case as fast as possible.
        CMF     f0, f1
        CNFNE   f0, f1
        POLNE$p f0, f1, f0
        Return  ,,LinkNotStacked,NE
        ; If x == �y, then may be special cases (inf,inf) and (0,0), which
        ; POL rejects as invalid operations.
        ; Handle these by regularising arguments, eg: (+inf,-inf) -> (+1,-1)
        ; (+0,+0) -> (+0,+1)
        CMF     f0, #0
        BNE     %FT10
      [ FloatingPointArgsInRegs
        [ "$p" = "S"
        STFS    f1, [sp, #-4]!
        LDR     r2, [sp], #4
        |
        STFD    f1, [sp, #-8]!
        LDR     r2, [sp], #8
        ]
      ]
        TEQ     r2, #0
        MVFPL$p f1, #1
        MNFMI$p f1, #1
        POL$p   f0, f1, f0
        Return  ,,LinkNotStacked

10      MVFGT$p f2, #1
        MNFLE$p f2, #1
        CMF     f0, f1
        MVFEQ$p f1, f2
        MNFNE$p f1, f2
        POL$p   f0, f1, f2
        Return  ,,LinkNotStacked
        MEND

;atan2   Atan2   D
atan2f  Atan2   S

        END