; Copyright 2008 Castle Technology Ltd
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
        GET     objmacs.s

        CodeArea

        EXPORT  _ll_from_u
        EXPORT  _ll_from_l
        EXPORT  _ll_to_l

        EXPORT  _ll_add
        EXPORT  _ll_addlu
        EXPORT  _ll_addls
        EXPORT  _ll_adduu
        EXPORT  _ll_addss
        EXPORT  _ll_sub
        EXPORT  _ll_sublu
        EXPORT  _ll_subls
        EXPORT  _ll_subuu
        EXPORT  _ll_subss
        EXPORT  _ll_rsb
        EXPORT  _ll_rsblu
        EXPORT  _ll_rsbls
        EXPORT  _ll_rsbuu
        EXPORT  _ll_rsbss
        EXPORT  _ll_mul
        EXPORT  _ll_mullu
        EXPORT  _ll_mulls
        EXPORT  _ll_muluu
        EXPORT  _ll_mulss
        EXPORT  _ll_udiv
        EXPORT  _ll_urdv
        EXPORT  _ll_udiv10
        EXPORT  _ll_sdiv
        EXPORT  _ll_srdv
        EXPORT  _ll_sdiv10

        EXPORT  _ll_not
        EXPORT  _ll_neg
        EXPORT  _ll_and
        EXPORT  _ll_or
        EXPORT  _ll_eor
        EXPORT  _ll_shift_l
        EXPORT  _ll_ushift_r
        EXPORT  _ll_sshift_r

        EXPORT  _ll_cmpu
        EXPORT  _ll_cmpge
        EXPORT  _ll_cmple

        IMPORT  __rt_div0
        IMPORT  __rt_udiv

                GBLL    HaveCLZ
HaveCLZ         SETL    {FALSE}

XOS_EnterOS     *       &16
XOS_LeaveOS     *       &7C

CPUArch_pre_v4  *       0
CPUArch_v4      *       1
CPUArch_v4T     *       2
CPUArch_v5      *       3
CPUArch_v5T     *       4
CPUArch_v5TE    *       5
CPUArch_v5TEJ   *       6
CPUArch_v6      *       7

cp15    CP      15
c0      CN      0

        ; Routine to determine the CPU architecture
        ; (needs to be called from init somewhere)
        ; In:  v6 = static base, USR mode
ReadCPUArch
        Push    "a1,lr"
        SWI     XOS_EnterOS
        MRC     cp15, 0, lr, c0, c0, 0
        ANDS    a1, lr, #&F000
        MOVEQ   lr, #0 ; 0 if pre-ARM7
        TEQNE   a1, #&7000
        MOVEQ   a1, lr, LSR #22
        ANDEQ   a1, a1, #2 ; ARM7 may be v3 or v4T
        MOVNE   a1, lr, LSR #16
        ANDNE   a1, a1, #&F ; post-ARM7 may be v4 onwards
;        STR     a1, [v6, #O__architecture]
        SWI     XOS_LeaveOS
      [ {CONFIG}=26
        TEQVSP  pc, #0
      |
        MSRVS   CPSR_c, #0
      ]
        Pop     "a1,pc"

        ; CPUArch
        ; Determine the architecture of the CPU
        ; Ideally this should be cached in static workspace
        ; but that can't be done as a retrofit to old versions!
        ; $r: output register to hold one of the constants above
        ; $tmp: scratch register
        MACRO
$l      CPUArch $r, $tmp
 [ 1=1
        MOV     $r, #CPUArch_pre_v4 ; eg 7500FE
 |
  [ 1=1
        MOV     $r, #CPUArch_v4 ; eg StrongARM
  |
   [ 1=1
        MOV     $r, #CPUArch_v5TE ; eg XScale
   |
        LoadStaticBase $r, $tmp
        LDR     $r, [$r, #O__architecture]
   ]
  ]
 ]
        MEND


        ; Convert uint32_t to uint64_t or int64_t
        ; In:  a1
        ; Out: (a1,a2)
_ll_from_u
        MOV     a2, #0
        Return  ,, LinkNotStacked

        ; Convert int32_t to int64_t or uint64_t
        ; In:  a1
        ; Out: (a1,a2)
_ll_from_l
        MOV     a2, a1, ASR #31
        Return  ,, LinkNotStacked

        ; Convert int64_t or uint64_t to int32_t or uint32_t
        ; In:  (a1,a2)
        ; Out: a1
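        ; (no code is needed: the low word is already in a1)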
_ll_to_l
        Return  ,, LinkNotStacked


        ; Add two 64-bit numbers
        ; In:  (a1,a2),(a3,a4)
        ; Out: (a1,a2)
_ll_add
        ADDS    a1, a1, a3
        ADC     a2, a2, a4
        Return  ,, LinkNotStacked

        ; Add a uint32_t to a 64-bit number
        ; In:  (a1,a2),a3
        ; Out: (a1,a2)
_ll_addlu
        ADDS    a1, a1, a3
        ADC     a2, a2, #0
        Return  ,, LinkNotStacked

        ; Add an int32_t to a 64-bit number
        ; In:  (a1,a2),a3
        ; Out: (a1,a2)
_ll_addls
        ADDS    a1, a1, a3
        ADC     a2, a2, a3, ASR #31
        Return  ,, LinkNotStacked

        ; Create a 64-bit number by adding two uint32_t numbers
        ; In:  a1,a2
        ; Out: (a1,a2)
_ll_adduu
        ADDS    a1, a1, a2
        MOVCC   a2, #0
        MOVCS   a2, #1
        Return  ,, LinkNotStacked

        ; Create a 64-bit number by adding two int32_t numbers
        ; In:  a1,a2
        ; Out: (a1,a2)
_ll_addss
        MOV     ip, a1, ASR #31
        ADDS    a1, a1, a2
        ADC     a2, ip, a2, ASR #31
        Return  ,, LinkNotStacked


        ; Subtract two 64-bit numbers
        ; In:  (a1,a2),(a3,a4)
        ; Out: (a1,a2)
_ll_sub
        SUBS    a1, a1, a3
        SBC     a2, a2, a4
        Return  ,, LinkNotStacked

        ; Subtract a uint32_t from a 64-bit number
        ; In:  (a1,a2),a3
        ; Out: (a1,a2)
_ll_sublu
        SUBS    a1, a1, a3
        SBC     a2, a2, #0
        Return  ,, LinkNotStacked

        ; Subtract an int32_t from a 64-bit number
        ; In:  (a1,a2),a3
        ; Out: (a1,a2)
_ll_subls
        SUBS    a1, a1, a3
        SBC     a2, a2, a3, ASR #31
        Return  ,, LinkNotStacked

        ; Create a 64-bit number by subtracting two uint32_t numbers
        ; In:  a1,a2
        ; Out: (a1,a2)
_ll_subuu
        SUBS    a1, a1, a2
        MOVCC   a2, #-1
        MOVCS   a2, #0   ; carry = not borrow
        Return  ,, LinkNotStacked

        ; Create a 64-bit number by subtracting two int32_t numbers
        ; In:  a1,a2
        ; Out: (a1,a2)
_ll_subss
        MOV     ip, a1, ASR #31
        SUBS    a1, a1, a2
        SBC     a2, ip, a2, ASR #31
        Return  ,, LinkNotStacked


        ; Reverse-subtract two 64-bit numbers
        ; In:  (a1,a2),(a3,a4)
        ; Out: (a1,a2)
_ll_rsb
        RSBS    a1, a1, a3
        RSC     a2, a2, a4
        Return  ,, LinkNotStacked

        ; Subtract a 64-bit number from a uint32_t
        ; In:  (a1,a2),a3
        ; Out: (a1,a2)
_ll_rsblu
        RSBS    a1, a1, a3
        RSC     a2, a2, #0
        Return  ,, LinkNotStacked

        ; Subtract a 64-bit number from an int32_t
        ; In:  (a1,a2),a3
        ; Out: (a1,a2)
_ll_rsbls
        RSBS    a1, a1, a3
        RSC     a2, a2, a3, ASR #31
        Return  ,, LinkNotStacked

        ; Create a 64-bit number by reverse-subtracting two uint32_t numbers
        ; In:  a1,a2
        ; Out: (a1,a2)
_ll_rsbuu
        RSBS    a1, a1, a2
        MOVCC   a2, #-1
        MOVCS   a2, #0   ; carry = not borrow
        Return  ,, LinkNotStacked

        ; Create a 64-bit number by reverse-subtracting two int32_t numbers
        ; In:  a1,a2
        ; Out: (a1,a2)
_ll_rsbss
        MOV     ip, a1, ASR #31
        RSBS    a1, a1, a2
        RSC     a2, ip, a2, ASR #31 ; propagate the borrow from the low word
        Return  ,, LinkNotStacked


        ; Multiply two 64-bit numbers
        ; In:  (a1,a2),(a3,a4)
        ; Out: (a1,a2)
_ll_mul
        FunctionEntry
        CPUArch ip, lr
        CMP     ip, #CPUArch_v4
        BCC     mul_hardway
        ; Have UMULL instruction
        MOV     ip, a1
        UMULL   a1, lr, a3, a1
        MLA     lr, ip, a4, lr
        MLA     a2, a3, a2, lr
        Return
mul_hardway
        ; No UMULL instruction
        ; Break the operation down thus:
        ;              aaaaaaaa bbbb cccc
        ;            * dddddddd eeee ffff
        ;              ------------------
        ;                     cccc * ffff
        ;                bbbb * ffff
        ;                cccc * eeee
        ;           bbbb * eeee
        ;   aaaaaaaa * eeeeffff
        ; + dddddddd * bbbbcccc
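        ; Roughly equivalent C for this scheme (an illustrative sketch;
        ; the names are ours, not part of this source):
        ;   uint32_t b = x_lo >> 16, c = x_lo & 0xFFFF;  /* bbbb, cccc */
        ;   uint32_t e = y_lo >> 16, f = y_lo & 0xFFFF;  /* eeee, ffff */
        ;   uint64_t lo = (uint64_t)c*f + (((uint64_t)b*f + (uint64_t)c*e) << 16);
        ;   uint32_t hi = x_hi*y_lo + x_lo*y_hi + b*e + (uint32_t)(lo >> 32);
        ;   result = (uint64_t)hi << 32 | (uint32_t)lo;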
        MUL     a2, a3, a2          ; msw starts as aaaaaaaa * eeeeffff
        MLA     a2, a4, a1, a2      ; msw += dddddddd * bbbbcccc

        MOV     lr, a3, LSR #16     ; lr = eeee from now on
        MOV     ip, a1, LSR #16     ; ip = bbbb from now on
        SUB     a4, a3, lr, LSL #16 ; a4 = ffff
        SUB     a3, a1, ip, LSL #16 ; a3 = cccc
        MUL     a1, a3, a4          ; lsw starts as cccc * ffff

        MUL     a4, ip, a4
        MUL     a3, lr, a3
        ADDS    a3, a4, a3          ; a3 = (bbbb * ffff + cccc * eeee) [0:31]
        MOV     a4, a3, RRX         ; a4 = (bbbb * ffff + cccc * eeee) [1:32]

        ADDS    a1, a1, a3, LSL #16 ; lsw now complete
        ADC     a2, a2, a4, LSR #15
        MLA     a2, ip, lr, a2      ; msw completed by adding bbbb * eeee
        Return

        ; Multiply a 64-bit number by a uint32_t
        ; In:  (a1,a2),a3
        ; Out: (a1,a2)
_ll_mullu
        FunctionEntry
        CPUArch ip, lr
        CMP     ip, #CPUArch_v4
        MOVCC   a4, #0
        BCC     mul_hardway
        ; Have UMULL instruction
        UMULL   a1, lr, a3, a1
        MLA     a2, a3, a2, lr
        Return

        ; Multiply a 64-bit number by an int32_t
        ; In:  (a1,a2),a3
        ; Out: (a1,a2)
_ll_mulls
        MOV     a4, a3, ASR #31
        B       _ll_mul

        ; Create a 64-bit number by multiplying two uint32_t numbers
        ; In:  a1,a2
        ; Out: (a1,a2)
_ll_muluu
        FunctionEntry
        CPUArch ip, lr
        CMP     ip, #CPUArch_v4
        BCC     %FT50
        ; Have UMULL instruction
        MOV     lr, a1
        UMULL   a1, a2, lr, a2
        Return
50     ; No UMULL instruction
        MOV     a3, a2
        MOV     a2, #0
        MOV     a4, #0
        B       mul_hardway

        ; Create a 64-bit number by multiplying two int32_t numbers
        ; In:  a1,a2
        ; Out: (a1,a2)
_ll_mulss
        FunctionEntry
        CPUArch ip, lr
        CMP     ip, #CPUArch_v4
        BCC     %FT50
        ; Have SMULL instruction
        MOV     lr, a1
        SMULL   a1, a2, lr, a2
        Return
50     ; No SMULL instruction
        MOV     a3, a2
        MOV     a2, a1, ASR #31
        MOV     a4, a3, ASR #31
        B       mul_hardway


        ; Emulate CLZ instruction for architectures that lack it
        ; Pinched from AsmUtils
soft_clz
        ORRS    a4, a1, a1, LSR #1
        MOVEQ   a1, #32
        ORRNE   a1, a4, a4, LSR #2
        Return  ,, LinkNotStacked, EQ
        ORR     a1, a1, a1, LSR #4
        LDR     a2, =&06C9C57D
        ORR     a1, a1, a1, LSR #8
        ADR     a3, clz_table
        ORR     a1, a1, a1, LSR #16
        MLAS    a1, a2, a1, a2
        LDRNEB  a1, [a3, a1, LSR #27]
        Return  ,, LinkNotStacked
clz_table
        = 32, 31, 14, 30, 22, 13, 29, 19,  2, 21, 12, 10, 25, 28, 18,  8
        =  1, 15, 23, 20,  3, 11, 26,  9, 16, 24,  4, 27, 17,  5,  6,  7
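        ; In rough C, the routine above is (illustrative only):
        ;   if (x == 0) return 32;
        ;   x |= x >> 1; x |= x >> 2; x |= x >> 4;   /* smear: x = 2^n - 1 */
        ;   x |= x >> 8; x |= x >> 16;
        ;   uint32_t p = 0x06C9C57D * (x + 1);       /* constant shifted left by n */
        ;   return p ? clz_table[p >> 27] : 0;       /* p wraps to 0 only when clz == 0 */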

        ; Divide a uint64_t by another, returning both quotient and remainder
        ; In:  dividend (a1,a2), divisor (a3,a4)
        ; Out: quotient (a1,a2), remainder (a3,a4)
_ll_udiv
        FunctionEntry , "a1-v6,sl,fp"
        ; Register usage:
        ; v1,v2 = quotient (initially 0)
        ; v3,v4 = remainder (initially dividend)
        ; v5,v6 = divisor
        ; sl = CPU architecture
        ; fp used as a scratch register
        ; note none of our callees use sl or fp in their usual sense
        Pop     "v3-v6"
_ll_udiv_lateentry
        MOV     v1, #0
        MOV     v2, #0

        ; Calculate a floating point underestimate of the
        ; reciprocal of the divisor. The representation used is
        ;   mantissa: 16 bits
        ;   exponent: number of binary places below integers of lsb of mantissa
        ; The way the mantissa and exponent are calculated
        ; depends upon the number of leading zeros in the divisor.
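        ; For example (illustrative): a divisor of 10 has 60 leading
        ; zeros, so the 48..63 case below computes
        ;   mantissa = &80000000 / (10 << 12) = &CCCC
        ;   exponent = 79 - 60 = 19
        ; and &CCCC * 2^-19 = 0.0999984..., just under 1/10.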
        CPUArch sl, lr
        CMP     sl, #CPUArch_v5
        CLZCS   a1, v6
        MOVCC   a1, v6
        BLCC    soft_clz
        MOV     fp, a1 ; fp = leading zeros in divisor
        CMP     fp, #16
        BCS     %FT10
        ; Divisor has 0..15 leading zeros.
        MOV     a2, v6, LSL fp
        MOVS    a1, v5
        MOVEQS  a1, a2, LSL #16
        MOVNE   a1, #1 ; round up to account for loss of accuracy
        ADD     a1, a1, a2, LSR #16 ; divisor for calculating mantissa
        B       %FT40

10      CMP     v6, #0
        BEQ     %FT20
        ; Divisor has 16..31 leading zeros.
        SUB     a2, fp, #16
        RSB     a3, fp, #48
        MOVS    a1, v5, LSL a2
        MOVNE   a1, #1 ; round up to account for loss of accuracy
        ADD     a1, a1, v6, LSL a2
        ADD     a1, a1, v5, LSR a3 ; divisor for calculating mantissa
        B       %FT40

20      CMP     sl, #CPUArch_v5
        CLZCS   a1, v5
        MOVCC   a1, v5
        BLCC    soft_clz
        ADD     fp, a1, #32 ; fp = leading zeros in divisor
        CMP     fp, #48
        BCS     %FT30
        ; Divisor has 32..47 leading zeros.
        MOV     a2, v5, LSL a1
        MOVS    a1, a2, LSL #16
        MOVNE   a1, #1 ; round up to account for loss of accuracy
        ADD     a1, a1, a2, LSR #16 ; divisor for calculating mantissa
        B       %FT40

30      CMP     v5, #0
        BEQ     %FT99
        ; Divisor has 48..63 leading zeros.
        SUB     a2, a1, #16
        MOV     a1, v5, LSL a2 ; divisor for calculating mantissa
        ; drop through

40      MOV     a2, #&80000000 ; dividend for calculating mantissa
        BL      __rt_udiv ; a1 = mantissa &8000..&10000
        RSB     a2, fp, #15+64 ; a2 = exponent
        TST     a1, #&10000
        MOVNE   a1, #&8000 ; force any &10000 mantissas into 16 bits
        SUBNE   a2, a2, #1

50      ; Main iteration loop:
        ; each time round loop, calculate a close underestimate of
        ; the quotient by multiplying through the "remainder" by the
        ; approximate reciprocal of the divisor.
        ; a1 = mantissa
        ; a2 = exponent
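        ; In outline (illustrative):
        ;   quotient_increment = (remainder * mantissa) >> exponent
        ; is a close underestimate of remainder / divisor, so add it to
        ; the quotient and subtract quotient_increment * divisor from
        ; the remainder.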

        ; Perform 16 (a1) * 64 (v3,v4) -> 80 (a3,a4,lr) multiply
        CMP     sl, #CPUArch_v4
        BCC     %FT51

        ; Have UMULL instruction
        UMULL   a3, ip, v3, a1
        UMULL   a4, lr, v4, a1
        ADDS    a4, ip, a4
        ADC     lr, lr, #0
        B       %FT60

51      ; No UMULL instruction
        ;        aaaa bbbb cccc dddd
        ;      *                eeee
        ;        -------------------
        ;                dddd * eeee
        ;           cccc * eeee
        ;      bbbb * eeee
        ; aaaa * eeee
        MOV     ip, v4, LSR #16
        MOV     fp, v3, LSR #16
        SUB     a4, v4, ip, LSL #16
        SUB     a3, v3, fp, LSL #16
        MUL     ip, a1, ip
        MUL     fp, a1, fp
        MUL     a4, a1, a4
        MUL     a3, a1, a3
        MOV     lr, ip, LSR #16
        MOV     ip, ip, LSL #16
        ORR     ip, ip, fp, LSR #16
        MOV     fp, fp, LSL #16
        ADDS    a3, a3, fp
        ADCS    a4, a4, ip
        ADC     lr, lr, #0

60      ; Shift down by exponent
        ; First a word at a time, if necessary:
        SUBS    ip, a2, #32
        BCC     %FT62
61      MOV     a3, a4
        MOV     a4, lr
        MOV     lr, #0
        SUBS    ip, ip, #32
        BCS     %BT61
62      ; Then by bits, if necessary:
        ADDS    ip, ip, #32
        BEQ     %FT70
        RSB     fp, ip, #32
        MOV     a3, a3, LSR ip
        ORR     a3, a3, a4, LSL fp
        MOV     a4, a4, LSR ip
        ORR     a4, a4, lr, LSL fp

70      ; Now (a3,a4) contains an underestimate of the quotient.
        ; Add it to the running total for the quotient, then
        ; multiply through by divisor and subtract from the remainder.

        ; Sometimes (a3,a4) = 0, in which case this step can be skipped.
        ORRS    lr, a3, a4
        BEQ     %FT80

        ADDS    v1, v1, a3
        ADC     v2, v2, a4

        CMP     sl, #CPUArch_v4
        MOVCS   lr, a3
        UMULLCS a3, ip, v5, lr
        MLACS   a4, v5, a4, ip
        MLACS   a4, v6, lr, a4
        BCS     %FT75

        ; No UMULL instruction
        ; Proceed as for mul_hardway
        MUL     a4, v5, a4
        MLA     a4, v6, a3, a4

        MOV     ip, a3, LSR #16
        MOV     lr, v5, LSR #16
        SUB     fp, a3, ip, LSL #16
        SUB     lr, v5, lr, LSL #16
        MUL     a3, fp, lr
        Push    "ip"

        MUL     ip, lr, ip
        MOV     lr, v5, LSR #16
        MUL     fp, lr, fp
        ADDS    fp, ip, fp
        MOV     ip, fp, RRX

        ADDS    a3, a3, fp, LSL #16
        ADC     a4, a4, ip, LSR #15
        Pop     "ip"
        MLA     a4, ip, lr, a4

75      SUBS    v3, v3, a3
        SBC     v4, v4, a4

80      ; Termination condition for iteration loop is
        ;   remainder < divisor
        ; OR
        ;   quotient increment == 0
        CMP     v3, v5
        SBCS    lr, v4, v6
        TEQCC   lr, lr     ; set Z if r < d (and preserve C)
        ORRCSS  lr, a3, a4 ; else Z = a3 and a4 both 0
        BNE     %BT50

        ; The final multiple of the divisor can get lost in rounding
        ; so subtract one more divisor if necessary
        CMP     v3, v5
        SBCS    lr, v4, v6
        BCC     %FT85
        ADDS    v1, v1, #1
        ADC     v2, v2, #0
        SUBS    v3, v3, v5
        SBC     v4, v4, v6
85
        Push    "v1-v4"
        Return  , "a1-v6,sl,fp"

99      ; Division by zero
        Pop     "v1-v6,sl,fp,lr"
        B       __rt_div0

        ; Divide a uint64_t by another, returning both quotient and remainder
        ; (as _ll_udiv, but with dividend and divisor swapped on entry)
        ; In:  divisor (a1,a2), dividend (a3,a4)
        ; Out: quotient (a1,a2), remainder (a3,a4)
_ll_urdv
        FunctionEntry , "a1-v6,sl,fp"
        Pop     "v5,v6"
        Pop     "v3,v4"
        B       _ll_udiv_lateentry

        ; Divide a uint64_t by 10, returning both quotient and remainder
        ; In:  (a1,a2)
        ; Out: quotient (a1,a2), remainder (a3,a4)
_ll_udiv10
        Push    "a1"
        CPUArch ip, a4
        CMP     ip, #CPUArch_v4
        BCC     %FT50
        ; Have UMULL instruction
        ; Multiply by 0.6 (= &0.999 recurring)
        ; and subtract multiplication by 0.5 (LSR #1).
        ; Ignore fractional parts for now.
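        ; This works because x*0.6 - x*0.5 = x/10, and &0.999... recurring
        ; is exactly 0.6, since &99999999 / (2^32 - 1) = 3/5.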
        LDR     ip, =&99999999
        UMULL   a4, a3, a1, ip
        UMULL   a4, ip, a2, ip
        MOVS    a2, a2, LSR #1
        MOVS    a1, a1, RRX
        ADCS    a1, a1, #0
        ADC     a2, a2, #0
        SUBS    a1, a4, a1
        SBC     a2, ip, a2
        ADDS    a1, a1, ip
        ADC     a2, a2, #0
        ADDS    a1, a1, a3
        ADC     a2, a2, #0
        ; It can be shown mathematically that this is an underestimate
        ; of the true quotient by up to 2.5. Compensate by detecting
        ; over-large remainders.
40      MOV     ip, #10
        MUL     a3, a1, ip ; quotient * 10 (MSW is unimportant)
        Pop     "a4"
        SUB     a3, a4, a3 ; remainder between 0 and 25
        ; Bring the remainder back within range.
        ; For a number x <= 68, x / 10 == (x * 13) >> 7
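        ; (eg x = 49: 49 * 13 = 637 and 637 >> 7 = 4 = 49 / 10)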
        MOV     a4, #13
        MUL     a4, a3, a4
        MOV     a4, a4, LSR #7
        ADDS    a1, a1, a4
        ADC     a2, a2, #0
        MUL     a4, ip, a4
        SUB     a3, a3, a4
        MOV     a4, #0
        Return  ,, LinkNotStacked

50      ; No UMULL instruction
        ; Multiply by 0.6 (= &0.999 recurring)
        ; and subtract multiplication by 0.5 (LSR #1).
        ; Ignore fractional parts for now.
        Push    "v1,lr"
        LDR     lr, =&9999
        MOV     ip, a2, LSR #16     ; MS halfword
        SUB     v1, a2, ip, LSL #16
        MOV     a4, a1, LSR #16
        SUB     a3, a1, a4, LSL #16 ; LS halfword
        MUL     a3, lr, a3          ; multiply through by &9999
        MUL     a4, lr, a4
        MUL     v1, lr, v1
        MUL     ip, lr, ip
        MOVS    a2, a2, LSR #1      ; find half the dividend
        MOVS    a1, a1, RRX
        ADCS    a1, a1, #0          ; round upwards
        ADC     a2, a2, #0
        ADD     a4, a4, a4, LSR #16 ; can't unsigned overflow
        ADD     a4, a4, a3, LSR #16 ; can't unsigned overflow
        SUBS    a1, a4, a1
        SBC     a2, ip, a2
        ADDS    a1, a1, v1
        ADC     a2, a2, #0
        ADDS    a1, a1, v1, ROR #16
        ADC     a2, a2, v1, LSR #16
        ADDS    a1, a1, ip
        ADC     a2, a2, #0
        ADDS    a1, a1, ip, ROR #16
        ADC     a2, a2, ip, LSR #16
        ; It can be shown mathematically that this is an underestimate
        ; of the true quotient by up to 4.5. Compensate by detecting
        ; over-large remainders.
        Pop     "v1,lr"
        B       %BT40

        ; Divide an int64_t by another, returning both quotient and remainder
        ; In:  dividend (a1,a2), divisor (a3,a4)
        ; Out: quotient (a1,a2), remainder (a3,a4)
        ; Remainder has same sign as dividend - required by C99, although
        ; earlier versions of C allowed the sign to match the divisor
_ll_sdiv
        FunctionEntry , "v1"
        MOVS    v1, a4, LSR #31
        BEQ     %FT10
        ; Find absolute divisor
        RSBS    a3, a3, #0
        RSC     a4, a4, #0
10      EORS    v1, v1, a2, ASR #31
        BPL     %FT20
        ; Find absolute dividend
        RSBS    a1, a1, #0
        RSC     a2, a2, #0
20      BL      _ll_udiv
        TEQ     v1, #0
        BPL     %FT30
        ; Remainder is negative (sign(dividend) == -1)
        RSBS    a3, a3, #0
        RSC     a4, a4, #0
30      TST     v1, #1
        BEQ     %FT40
        ; Quotient is negative (sign(divisor) != sign(dividend))
        RSBS    a1, a1, #0
        RSC     a2, a2, #0
40
        Return  , "v1"

        ; Divide an int64_t by another, returning both quotient and remainder
        ; (as _ll_sdiv, but with dividend and divisor swapped on entry)
        ; In:  divisor (a1,a2), dividend (a3,a4)
        ; Out: quotient (a1,a2), remainder (a3,a4)
        ; Remainder has same sign as dividend - required by C99, although
        ; earlier versions of C allowed the sign to match the divisor
_ll_srdv
        FunctionEntry , "v1"
        MOVS    v1, a2, LSR #31
        BEQ     %FT10
        ; Find absolute divisor
        RSBS    a1, a1, #0
        RSC     a2, a2, #0
10      EORS    v1, v1, a4, ASR #31
        BPL     %FT20
        ; Find absolute dividend
        RSBS    a3, a3, #0
        RSC     a4, a4, #0
20      BL      _ll_urdv
        TEQ     v1, #0
        BPL     %FT30
        ; Remainder is negative (sign(dividend) == -1)
        RSBS    a3, a3, #0
        RSC     a4, a4, #0
30      TST     v1, #1
        BEQ     %FT40
        ; Quotient is negative (sign(divisor) != sign(dividend))
        RSBS    a1, a1, #0
        RSC     a2, a2, #0
40
        Return  , "v1"

        ; Divide an int64_t by 10, returning both quotient and remainder
        ; Remainder has same sign as dividend - required by C99, although
        ; earlier versions of C allowed the sign to match the divisor
        ; In:  (a1,a2)
        ; Out: quotient (a1,a2), remainder (a3,a4)
_ll_sdiv10
        FunctionEntry , "v1"
        MOVS    v1, a2
        BPL     %FT10
        RSBS    a1, a1, #0 ; find abs(dividend)
        RSC     a2, a2, #0
10      BL      _ll_udiv10
        TEQ     v1, #0
        Return  , "v1",, PL
        RSBS    a1, a1, #0
        RSC     a2, a2, #0
        RSBS    a3, a3, #0
        RSC     a4, a4, #0
        Return  , "v1"

        ; Find the bitwise NOT of a 64-bit number
        ; In:  (a1,a2)
        ; Out: (a1,a2)
_ll_not
        MVN     a1, a1
        MVN     a2, a2
        Return  ,, LinkNotStacked

        ; Find the negative of a 64-bit number
        ; In:  (a1,a2)
        ; Out: (a1,a2)
_ll_neg
        RSBS    a1, a1, #0
        RSC     a2, a2, #0
        Return  ,, LinkNotStacked

        ; Find the bitwise AND of two 64-bit numbers
        ; In:  (a1,a2)
        ; Out: (a1,a2)
_ll_and
        AND     a1, a1, a3
        AND     a2, a2, a4
        Return  ,, LinkNotStacked

        ; Find the bitwise OR of two 64-bit numbers
        ; In:  (a1,a2)
        ; Out: (a1,a2)
_ll_or
        ORR     a1, a1, a3
        ORR     a2, a2, a4
        Return  ,, LinkNotStacked

        ; Find the bitwise exclusive OR of two 64-bit numbers
        ; In:  (a1,a2)
        ; Out: (a1,a2)
_ll_eor
        EOR     a1, a1, a3
        EOR     a2, a2, a4
        Return  ,, LinkNotStacked

        ; Shift a 64-bit number left
        ; In:  (a1,a2),a3
        ; Out: (a1,a2)
_ll_shift_l
        RSBS    ip, a3, #32
        MOVHI   a2, a2, LSL a3
        ORRHI   a2, a2, a1, LSR ip
        MOVHI   a1, a1, LSL a3
        Return  ,, LinkNotStacked, HI
        SUB     ip, a3, #32
        MOV     a2, a1, LSL ip
        MOV     a1, #0
        Return  ,, LinkNotStacked

        ; Logical-shift a 64-bit number right
        ; In:  (a1,a2),a3
        ; Out: (a1,a2)
_ll_ushift_r
        RSBS    ip, a3, #32
        MOVHI   a1, a1, LSR a3
        ORRHI   a1, a1, a2, LSL ip
        MOVHI   a2, a2, LSR a3
        Return  ,, LinkNotStacked, HI
        SUB     ip, a3, #32
        MOV     a1, a2, LSR ip
        MOV     a2, #0
        Return  ,, LinkNotStacked

        ; Arithmetic-shift a 64-bit number right
        ; In:  (a1,a2),a3
        ; Out: (a1,a2)
_ll_sshift_r
        RSBS    ip, a3, #32
        MOVHI   a1, a1, LSR a3
        ORRHI   a1, a1, a2, LSL ip
        MOVHI   a2, a2, ASR a3
        Return  ,, LinkNotStacked, HI
        SUB     ip, a3, #32
        MOV     a1, a2, ASR ip
        MOV     a2, a1, ASR #31
        Return  ,, LinkNotStacked

        ; Compare two uint64_t numbers, or test two int64_t numbers for equality
        ; In:  (a1,a2),(a3,a4)
        ; Out: Z set if equal, Z clear if different
        ;      C set if unsigned higher or same, C clear if unsigned lower
        ;      all registers preserved
_ll_cmpu
        CMP     a2, a4
        CMPEQ   a1, a3
        MOV     pc, lr ; irrespective of calling standard

        ; Compare two int64_t numbers for testing GE or LT
        ; In:  (a1,a2),(a3,a4)
        ; Out: N == V if signed greater than or equal, N != V if signed less than
        ;      a1, a2 corrupted
_ll_cmpge
        SUBS    a1, a1, a3
        SBCS    a2, a2, a4
        MOV     pc, lr ; irrespective of calling standard

        ; Compare two int64_t numbers for testing LE or GT
        ; In:  (a1,a2),(a3,a4)
        ; Out: N == V if signed less than or equal, N != V if signed greater than
        ;      (ie subsequent instructions need to use GE/LT condition instead of LE/GT)
        ;      a1, a2 corrupted
_ll_cmple
        SUBS    a1, a3, a1
        SBCS    a2, a4, a2
        MOV     pc, lr ; irrespective of calling standard
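        ; For example, a compiler can evaluate (x < y) for two int64_t
        ; values by calling _ll_cmpge and branching on the LT condition.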

; Now the floating point functions...

        EXPORT  _ll_uto_d
        EXPORT  _ll_sto_d
        EXPORT  _ll_uto_f
        EXPORT  _ll_sto_f
        EXPORT  _ll_ufrom_d
        EXPORT  _ll_sfrom_d
        EXPORT  _ll_ufrom_f
        EXPORT  _ll_sfrom_f
        EXPORT  llrint
        EXPORT  llrintf
        EXPORT  llround
        EXPORT  llroundf

; bit  31      rounding direction
; bits 30..26  exceptions (30=INX,29=UFL,28=OFL,27=DVZ,26=IVO)
; bit  24      flush to zero
; bits 23..22  rounding mode
; bit  18      "round" version of to-nearest (halfway case round away from zero)
; bit  17      rounded convert (as opposed to towards zero)
; bit  16      attempt to convert NaN
; bits 9..7    in type
; bits 6..4    out type
; bits 3..0    function
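; eg &400000F8 = FE_EX_INEXACT + FE_EX_INTYPE_DOUBLE + FE_EX_OUTTYPE_ULONGLONG
;    + FE_EX_FN_CVT describes an inexact double to unsigned long long convert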

FE_EX_RDIR      *       &80000000
FE_EX_EXCEPT_MASK *     &7C000000
FE_EX_INEXACT   *       &40000000
FE_EX_UNDERFLOW *       &20000000
FE_EX_OVERFLOW  *       &10000000
FE_EX_DIVBYZERO *       &08000000
FE_EX_INVALID   *       &04000000
FE_EX_FLUSHZERO *       &01000000
FE_EX_ROUND_MASK *      &00C00000
FE_EX_CVT_RND   *       &00040000
FE_EX_CVT_R     *       &00020000
FE_EX_CVT_NAN   *       &00010000
FE_EX_INTYPE_MASK *     &00000380
FE_EX_OUTTYPE_MASK *    &00000070
FE_EX_TYPE_MASK *       &00000070
FE_EX_FN_MASK   *       &0000000F

FE_EX_ROUND_NEAREST *   &00000000
FE_EX_ROUND_PLUSINF *   &00400000
FE_EX_ROUND_MINUSINF *  &00800000
FE_EX_ROUND_ZERO *      &00C00000

FE_EX_BASETYPE_FLOAT *  0
FE_EX_BASETYPE_DOUBLE * 1
FE_EX_BASETYPE_UNSIGNED * 2
FE_EX_BASETYPE_INT *    4
FE_EX_BASETYPE_LONGLONG * FE_EX_BASETYPE_INT+FE_EX_BASETYPE_DOUBLE
FE_EX_BASETYPE_UINT *   FE_EX_BASETYPE_INT+FE_EX_BASETYPE_UNSIGNED
FE_EX_BASETYPE_ULONGLONG * FE_EX_BASETYPE_LONGLONG+FE_EX_BASETYPE_UNSIGNED

FE_EX_TYPE_FLOAT *      FE_EX_BASETYPE_FLOAT :SHL: 4
FE_EX_TYPE_DOUBLE *     FE_EX_BASETYPE_DOUBLE :SHL: 4
FE_EX_TYPE_INT  *       FE_EX_BASETYPE_INT :SHL: 4
FE_EX_TYPE_LONGLONG *   FE_EX_BASETYPE_LONGLONG :SHL: 4
FE_EX_TYPE_UINT *       FE_EX_BASETYPE_UINT :SHL: 4
FE_EX_TYPE_ULONGLONG *  FE_EX_BASETYPE_ULONGLONG :SHL: 4

FE_EX_INTYPE_FLOAT *    FE_EX_BASETYPE_FLOAT :SHL: 7
FE_EX_INTYPE_DOUBLE *   FE_EX_BASETYPE_DOUBLE :SHL: 7
FE_EX_INTYPE_INT  *     FE_EX_BASETYPE_INT :SHL: 7
FE_EX_INTYPE_LONGLONG * FE_EX_BASETYPE_LONGLONG :SHL: 7
FE_EX_INTYPE_UINT *     FE_EX_BASETYPE_UINT :SHL: 7
FE_EX_INTYPE_ULONGLONG * FE_EX_BASETYPE_ULONGLONG :SHL: 7

FE_EX_OUTTYPE_FLOAT *    FE_EX_BASETYPE_FLOAT :SHL: 4
FE_EX_OUTTYPE_DOUBLE *   FE_EX_BASETYPE_DOUBLE :SHL: 4
FE_EX_OUTTYPE_UNSIGNED * FE_EX_BASETYPE_UNSIGNED :SHL: 4
FE_EX_OUTTYPE_INT  *     FE_EX_BASETYPE_INT :SHL: 4
FE_EX_OUTTYPE_LONGLONG * FE_EX_BASETYPE_LONGLONG :SHL: 4
FE_EX_OUTTYPE_UINT *     FE_EX_BASETYPE_UINT :SHL: 4
FE_EX_OUTTYPE_ULONGLONG * FE_EX_BASETYPE_ULONGLONG :SHL: 4

FE_EX_FN_ADD    *       1
FE_EX_FN_SUB    *       2
FE_EX_FN_MUL    *       3
FE_EX_FN_DIV    *       4
FE_EX_FN_REM    *       5
FE_EX_FN_RND    *       6
FE_EX_FN_SQRT   *       7
FE_EX_FN_CVT    *       8
FE_EX_FN_CMP    *       9
FE_EX_FN_RAISE  *       15

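        ; Convert uint64_t (_ll_uto_d) or int64_t (_ll_sto_d) to double
        ; In:  (a1,a2)
        ; Out: double in (a1,a2)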
_ll_uto_d
        MOV     a3,#&42000000
        B       dfltll_normalise
_ll_sto_d
        ANDS    a3,a2,#&80000000
        BPL     %FT10
        RSBS    a1,a1,#0
        RSC     a2,a2,#0
10      ORR     a3,a3,#&42000000
dfltll_normalise
        SUB     a3,a3,#&00300000
        MOVS    a4,a2
        MOVNE   a4,#32
        MOVEQS  a2,a1
        Return  ,,LinkNotStacked,EQ
 [ HaveCLZ
        CLZ     ip,a2
        MOV     a2,a2,LSL ip
        SUB     a4,a4,ip
 |
        MOVS    ip,a2,LSR #16
        SUBEQ   a4,a4,#16
        MOVEQS  a2,a2,LSL #16
        TST     a2,#&FF000000
        SUBEQ   a4,a4,#8
        MOVEQ   a2,a2,LSL #8
        TST     a2,#&F0000000
        SUBEQ   a4,a4,#4
        MOVEQ   a2,a2,LSL #4
        TST     a2,#&C0000000
        SUBEQ   a4,a4,#2
        MOVEQS  a2,a2,LSL #2
        MOVPL   a2,a2,LSL #1
        SUBPL   a4,a4,#1
 ]
        ADD     a3,a3,a4,LSL #20
        ORR     ip,a2,a1,LSR a4
        RSB     a4,a4,#32
        MOV     a4,a1,LSL a4
        MOVS    a2,a4,LSL #21
        MOVNE   a2,#FE_EX_INEXACT
        STMDB   sp!,{a2,lr}
        MOVS    a2,a4,LSL #22
        ANDEQ   a2,a4,a4,LSR #1
        MOVEQS  a2,a2,LSR #11
        MOV     a2,a4,LSR #11
        ADCS    a2,a2,ip,LSL #21
        ADC     a1,a3,ip,LSR #11
        MOVS    a4,a4,LSL #22
        LDMIA   sp!,{ip,lr}
        TST     ip,#FE_EX_INEXACT
        BNE     __fpl_exception
        Return  ,,LinkNotStacked

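        ; Convert uint64_t (_ll_uto_f) or int64_t (_ll_sto_f) to float
        ; In:  (a1,a2)
        ; Out: float in a1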
_ll_uto_f
        MOV     a3,#&3F800000
        B       fltll_normalise
_ll_sto_f
        ANDS    a3,a2,#&80000000
        BPL     %FT10
        RSBS    a1,a1,#0
        RSC     a2,a2,#0
10      ORR     a3,a3,#&3F800000
fltll_normalise
        ADD     a3,a3,#&0F000000
        MOVS    a4,a2
        MOVNE   a4,#32
        MOVEQS  a2,a1
        Return  ,,LinkNotStacked,EQ
 [ HaveCLZ
        CLZ     ip,a2
        MOV     a2,a2,LSL ip
        SUB     a4,a4,ip
 |
        MOVS    ip,a2,LSR #16
        SUBEQ   a4,a4,#16
        MOVEQS  a2,a2,LSL #16
        TST     a2,#&FF000000
        SUBEQ   a4,a4,#8
        MOVEQ   a2,a2,LSL #8
        TST     a2,#&F0000000
        SUBEQ   a4,a4,#4
        MOVEQ   a2,a2,LSL #4
        TST     a2,#&C0000000
        SUBEQ   a4,a4,#2
        MOVEQS  a2,a2,LSL #2
        MOVPL   a2,a2,LSL #1
        SUBPL   a4,a4,#1
 ]
        ORR     a2,a2,a1,LSR a4
        ADD     a3,a3,a4,LSL #23
        RSB     a4,a4,#32
        MOVS    ip,a1,LSL a4
        ORRS    ip,ip,a2,LSL #25
        ADC     a1,a3,a2,LSR #8
        ADC     ip,pc,#0
        ORRNES  ip,ip,#4,2
        BICCS   a1,a1,#1
        MOVS    ip,ip,LSL #30
        BNE     __fpl_exception
        Return  ,,LinkNotStacked

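        ; Convert double to uint64_t, truncating towards zero
        ; In:  double in (a1,a2)
        ; Out: (a1,a2)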
_ll_ufrom_d
        MOVS    a3,a1,ASR #20
        MOV     a4,a1,LSL #11
        ORR     a4,a4,a2,LSR #21
        MOV     ip,a2,LSL #11
        ORRNE   a4,a4,#&80000000
        BMI     ll_ufrom_d_neg
        SUB     a3,a3,#&4E
        RSBS    a3,a3,#&03F0
        BLT     ll_ufrom_d_ivo
        CMP     a3,#&50
        MOVGE   a3,#&50
        MOV     a2,a4,LSR a3
        MOV     a1,ip,LSR a3
        RSBS    a3,a3,#32
        ORRHI   a1,a1,a4,LSL a3
        RSB     a3,a3,#0
        ORRLS   a1,a1,a4,LSR a3
        RSBS    a3,a3,#0
        MOVGE   ip,ip,LSL a3
        MOVLT   ip,ip,LSR #1
        ADDS    a3,a3,#32
        ORRGE   ip,ip,a4,LSL a3
        MOVGE   a4,#0
        CMP     a4,#1
        ORRCS   ip,ip,#1
        TST     ip,ip
        Return  ,,LinkNotStacked,EQ
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_ULONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT
        B       __fpl_exception
ll_ufrom_d_neg
        ADD     a3,a1,#&40000000
        CMN     a3,#&00100000
        BGE     ll_ufrom_d_ivo
        ORRS    a3,a2,a1,LSL #1
        MOV     a1,#0
        MOV     a2,#0
        LDRNE   ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_ULONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT
        BNE     __fpl_exception
        Return  ,,LinkNotStacked
ll_ufrom_d_ivo
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_ULONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INVALID
        MOV     a4,a1,LSL #1
        CMP     a4,#&FFE00000
        CMPEQ   a2,#0
        ORRHI   ip,ip,#FE_EX_CVT_NAN
        B       __fpl_exception
        LTORG


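        ; DblRound: generate a double -> long long conversion routine
        ; $round selects the rounding: ""      truncates towards zero
        ;                              "rint"  to nearest, ties to even
        ;                              "round" to nearest, ties away from zero
        ; In:  double in (a1,a2)
        ; Out: (a1,a2)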
        MACRO
$func   DblRound $round

$func
        MOVS    a3,a1,ASR #20           ; a3 = exponent, and 21 sign bits
        MOV     a4,a1,LSL #11
        ORR     a4,a4,a2,LSR #21
        MOV     ip,a2,LSL #11           ; ip = low mantissa
        ORRNE   a4,a4,#&80000000        ; a4 = high mantissa, unit bit forced on if neg
        BMI     $func._neg
        SUB     a3,a3,#&4E
        RSBS    a3,a3,#&03F0            ; a3 = &43E-exp = shift to get b.p. at bottom
        BLE     $func._ivo              ; (must shift right, to get bit 63 clear)
        CMP     a3,#80                  ; clamp to a shift by 80
        MOVGE   a3,#80
        MOV     a2,a4,LSR a3            ; a2 & a1 = shifted
        MOV     a1,ip,LSR a3
        RSBS    a3,a3,#32
        ORRHI   a1,a1,a4,LSL a3
        RSB     a3,a3,#0
        ORRLS   a1,a1,a4,LSR a3
        RSBS    a3,a3,#0
        MOVGE   ip,ip,LSL a3
        MOVLT   ip,ip,LSR #1
        ADDS    a3,a3,#32
        ORRGE   ip,ip,a4,LSL a3
        MOVGE   a4,#0
        CMP     a4,#1
        ORRCS   ip,ip,#1
        TST     ip,ip
        Return  ,,LinkNotStacked,EQ
 [ "$round" = "rint"
        TEQ     ip,#&80000000
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT+FE_EX_CVT_R
        MVNEQS  a4,a1,LSL #31
        BMI     __fpl_exception
        ADDS    a1,a1,#1                ; Can't overflow, as any argument >= 2^52
        ADCS    a2,a2,#0                ; is an integer, so won't get here
        B       __fpl_exception
 ]
 [ "$round" = "round"
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT+FE_EX_CVT_RND
        BPL     __fpl_exception
        ADDS    a1,a1,#1
        ADCS    a2,a2,#0
        B       __fpl_exception
 ]
 [ "$round" = ""
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT
        B       __fpl_exception
 ]
$func._neg
        ADD     a3,a3,#&03F0
        CMN     a3,#&0410
        BICEQ   a4,a4,#&80000000        ; clear sign bit if exponent = 0
        RSBS    a3,a3,#&2E
        BLT     $func._ivo
        BEQ     $func._minint
$func._neg_noovf
        CMP     a3,#&50
        MOVGE   a3,#&50
        MOV     a2,a4,LSR a3
        MOV     a1,ip,LSR a3
        RSBS    a3,a3,#32
        ORRHI   a1,a1,a4,LSL a3
        RSB     a3,a3,#0
        ORRLS   a1,a1,a4,LSR a3
        RSBS    a1,a1,#0
        RSC     a2,a2,#0
        RSBS    a3,a3,#0
        MOVGE   ip,ip,LSL a3
        MOVLT   ip,ip,LSR #1
        ADDS    a3,a3,#32
        ORRGE   ip,ip,a4,LSL a3
        MOVGE   a4,#0
        CMP     a4,#1
        ORRCS   ip,ip,#1
        TST     ip,ip
        Return  ,,LinkNotStacked,EQ
 [ "$round" = "rint"
        TEQ     ip,#&80000000
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT+FE_EX_CVT_R
        MVNEQS  a4,a1,LSL #31
        BMI     __fpl_exception
        SUBS    a1,a1,#1
        SBC     a2,a2,#0
        B       __fpl_exception
 ]
 [ "$round" = "round"
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT+FE_EX_CVT_RND
        BPL     __fpl_exception
        SUBS    a1,a1,#1
        SBCS    a2,a2,#0
        B       __fpl_exception
 ]
 [ "$round" = ""
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT
        B       __fpl_exception
 ]
$func._minint
        TEQ     ip,#0
        TEQEQ   a4,#&80000000
        BEQ     $func._neg_noovf
$func._ivo
 [ "$round" = "rint"
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INVALID+FE_EX_CVT_R
 |
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INVALID
 ]
        MOV     a4,a1,LSL #1
        CMP     a4,#&FFE00000
        CMPEQ   a2,#0
        ORRHI   ip,ip,#FE_EX_CVT_NAN
        B       __fpl_exception
        MEND

_ll_sfrom_d     DblRound
llrint          DblRound  rint
llround         DblRound  round
        LTORG

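        ; Convert float to uint64_t, truncating towards zero
        ; In:  float in a1
        ; Out: (a1,a2)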
_ll_ufrom_f
        MOVS    a3,a1,ASR #23
        MOV     a4,a1,LSL #8
        ORRNE   a4,a4,#&80000000
        BMI     ll_ufrom_f_negative
        RSBS    a3,a3,#&BE
        BCC     ll_ufrom_f_ivo
        MOV     a2,a4,LSR a3
        SUBS    ip,a3,#32
        MOVCS   a1,a4,LSR ip
        RSBCC   ip,a3,#32
        MOVCC   a1,a4,LSL ip
        Return  ,,LinkNotStacked,CC
        RSBS    a3,a3,#&40
        MOVPL   a4,a4,LSL a3
        MOVMI   a4,a4,LSR #1
        TST     a4,a4
        Return  ,,LinkNotStacked,EQ
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_ULONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT
        B       __fpl_exception
ll_ufrom_f_negative
        MOV     ip,a1,LSL #1
        CMP     ip,#&7F000000
        BCS     ll_ufrom_f_ivo
        MOV     a1,#0
        MOV     a2,#0
        CMP     ip,#0
        LDRNE   ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_ULONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT
        BNE     __fpl_exception
        Return  ,,LinkNotStacked
ll_ufrom_f_ivo
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_ULONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INVALID
        MOV     a4,a1,LSL #1
        CMP     a4,#&FF000000
        ORRHI   ip,ip,#FE_EX_CVT_NAN
        B       __fpl_exception
        LTORG

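        ; FltRound: generate a float -> long long conversion routine
        ; $round selects the rounding as for DblRound above
        ; In:  float in a1
        ; Out: (a1,a2)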
        MACRO
$func   FltRound $round

$func
        MOVS    a3,a1,ASR #23
        MOV     a4,a1,LSL #8
        ORRNE   a4,a4,#&80000000
        BMI     $func.negative
        RSBS    a3,a3,#&BE
        BLS     $func.ivo
        MOV     a2,a4,LSR a3
        SUBS    ip,a3,#32
        MOVCS   a1,a4,LSR ip
        RSBCC   ip,a3,#32
        MOVCC   a1,a4,LSL ip
        Return  ,,LinkNotStacked,CC
        RSBS    a3,a3,#64
        MOVPL   a4,a4,LSL a3
        MOVMI   a4,a4,LSR #1
        TST     a4,a4
        Return  ,,LinkNotStacked,EQ
 [ "$round" = "rint"
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT+FE_EX_CVT_R
        TEQ     a4,#&80000000
        MVNEQS  a4,a1,LSL #31
        ADDPL   a1,a1,#1                ; Can't overflow, as any argument >= 2^23
                                        ; is an integer, so won't get here
 ]
 [ "$round" = "round"
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT+FE_EX_CVT_RND
        ADDMI   a1,a1,#1
 ]
 [ "$round" = ""
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT
 ]
        B       __fpl_exception
$func.negative
        CMP     a1,#&DF000000
        BHI     $func.ivo
        ANDS    a3,a3,#&FF
        BICEQ   a4,a4,#&80000000
        RSB     a3,a3,#&BE
        MOV     a2,a4,LSR a3
        SUBS    ip,a3,#32
        MOVCS   a1,a4,LSR ip
        RSBCC   ip,a3,#32
        MOVCC   a1,a4,LSL ip
        RSBS    a1,a1,#0
        RSC     a2,a2,#0
        RSBS    a3,a3,#&40
        MOVPL   a4,a4,LSL a3
        MOVMI   a4,a4,LSR #1
        TST     a4,a4
        Return  ,,LinkNotStacked,EQ
 [ "$round" = "rint"
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT+FE_EX_CVT_R
        TEQ     a4,#&80000000
        MVNEQS  a4,a1,LSL #31
        SUBPL   a1,a1,#1                ; Can't overflow, as any argument >= 2^23
                                        ; is an integer, so won't get here
 ]
 [ "$round" = "round"
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT+FE_EX_CVT_RND
        SUBMI   a1,a1,#1
 ]
 [ "$round" = ""
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT
 ]
        B       __fpl_exception
$func.ivo
 [ "$round" = "rint"
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INVALID+FE_EX_CVT_R
 |
        LDR     ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INVALID
 ]
        MOV     a4,a1,LSL #1
        CMP     a4,#&FF000000
        ORRHI   ip,ip,#FE_EX_CVT_NAN
        B       __fpl_exception
        MEND

_ll_sfrom_f     FltRound

; Extra complication because of callee narrowing: float arguments arrive
; widened to double, so narrow them via the FPA before converting

llrintf
        STMFD   sp!,{a1-a2}     ; push the widened double argument
        LDFD    f0,[sp],#8      ; load it into the FPA
        STFS    f0,[sp,#-4]!    ; store it back as a single, narrowing it
        LDR     a1,[sp],#4      ; reload the narrowed float
        ; fall through

_ll_sfrom_f_r   FltRound  rint

llroundf
        STMFD   sp!,{a1-a2}     ; push the widened double argument
        LDFD    f0,[sp],#8      ; load it into the FPA
        STFS    f0,[sp,#-4]!    ; store it back as a single, narrowing it
        LDR     a1,[sp],#4      ; reload the narrowed float
        ; fall through

_ll_sfrom_f_rnd FltRound  round




; FP support code.

; __fpl_exception receives all exception-generating results. This includes
; all inexact results, so it is responsible for rounding.
;
; ip on entry tells it what to do, and consists of the FE_EX_xxx flags

__fpl_exception
        TST     ip,#FE_EX_OUTTYPE_DOUBLE
        MOVEQ   a3,a2
        STMDB   sp!,{a1-a4}
        RFS     a2
        BIC     a4,ip,a2,LSL #10       ; BIC out enabled exceptions
        ANDS    a4,a4,#FE_EX_UNDERFLOW+FE_EX_OVERFLOW
        ORRNE   ip,ip,#FE_EX_INEXACT
        MOV     a4,ip,LSL #1
        MOV     a4,a4,LSR #27           ; move exceptions down to bottom
        ORR     a2,a2,a4                ; OR them into cumulative FPSR bits
        AND     a4,a2,#&100             ; extract ND bit
        ORR     ip,ip,a4,LSL #16        ; pop it in our word
        WFS     a2
        AND     a4,ip,#FE_EX_FN_MASK
        TEQ     a4,#FE_EX_FN_CVT
        ANDNE   a4,ip,#FE_EX_TYPE_MASK
        ORRNE   ip,ip,a4,LSL #3
        MOVEQ   a4,#FE_EX_CVT_R
        BICEQS  a4,a4,ip
        ORREQ   ip,ip,#FE_EX_ROUND_ZERO
        ; If we actually had trap handlers we should worry about
        ; FE_EX_CVT_RND here (eg have FE_EX_ROUND_NEAREST)
        TST     ip,#FE_EX_UNDERFLOW
        BNE     underflow
        TST     ip,#FE_EX_OVERFLOW
        BNE     overflow
        TST     ip,#FE_EX_INEXACT
        BNE     inexact
        TST     ip,#FE_EX_DIVBYZERO
        BNE     divide_by_zero
; invalid
        TST     a2,#&00010000           ; IOE bit
        LDMIA   sp!,{a1-a4}
        BEQ     return_NaN
        B       _fp_trapveneer
overflow
        TST     a2,#&00040000           ; OFE bit
        LDMIA   sp!,{a1-a4}
        BEQ     ovf_return
        B       _fp_trapveneer
underflow
        TST     a2,#&00080000           ; UFE bit
        LDMIA   sp!,{a1-a4}
        BEQ     return_result
        B       _fp_trapveneer
divide_by_zero
        TST     a2,#&00020000           ; DZE bit
        LDMIA   sp!,{a1-a4}
        BNE     _fp_trapveneer
        EOR     a3,a1,a3
        B       return_Inf
inexact
        TST     a2,#&00100000           ; IXE bit
        LDMIA   sp!,{a1-a4}
        BEQ     return_result
        B       _fp_trapveneer
return_result
        TST     ip,#FE_EX_OUTTYPE_DOUBLE
        Return  ,,LinkNotStacked
ovf_return
        AND     a3,a1,#&80000000
return_Inf
        AND     a3,a3,#&80000000
        TST     ip,#FE_EX_OUTTYPE_DOUBLE
        ADRNE   a1,prototype_double_Inf
        LDMNEIA a1,{a1,a2}
        ORRNE   a1,a1,a3
        LDREQ   a1,prototype_single_Inf
        ORREQ   a1,a1,a3
        Return  ,,LinkNotStacked
return_NaN
        AND     a3,a1,#&80000000
        TST     ip,#FE_EX_OUTTYPE_DOUBLE
        ADRNE   a1,prototype_double_NaN
        LDMNEIA a1,{a1,a2}
        ORRNE   a1,a1,a3
        LDREQ   a1,prototype_single_NaN
        ORREQ   a1,a1,a3
        B       __fpl_return_NaN
prototype_double_Inf
        DCD     &7FF00000,&00000000
        DCD     &7FEFFFFF,&FFFFFFFF
prototype_single_Inf
        DCD     &7F800000
        DCD     &7F7FFFFF
prototype_double_NaN
        DCD     &7FF80000,&00000000
prototype_single_NaN
        DCD     &7FC00000

__fpl_return_NaN
        AND     a4,ip,#FE_EX_FN_MASK
;        CMP     a4,#FE_EX_FN_CMP
;        MOVEQ   a1,#8
;        BEQ     __fpl_cmpreturn
        CMP     a4,#FE_EX_FN_CVT
        ANDEQ   a4,ip,#FE_EX_OUTTYPE_INT
        TEQEQ   a4,#FE_EX_OUTTYPE_INT
        Return  ,,LinkNotStacked,NE
        TST     ip,#FE_EX_CVT_NAN
        BNE     return_zero
        TST     ip,#FE_EX_OUTTYPE_UNSIGNED
        BNE     return_umaxint
        TST     ip,#FE_EX_OUTTYPE_DOUBLE       ; long long?
        MOV     a3,a1
        MVNEQ   a1,#&80000000
        MVNNE   a2,#&80000000
        MVNNE   a1,#0
        TST     a3,#&80000000
        MVNNE   a1,a1
        MVNNE   a2,a2
        Return  ,,LinkNotStacked
return_zero
        MOV     a1,#0
        MOV     a2,#0
        Return  ,,LinkNotStacked
return_umaxint
        MVN     a1,#0
        MVN     a2,#0
        TST     a3,#&80000000
        MVNNE   a1,a1
        MVNNE   a2,a2
        Return  ,,LinkNotStacked


        IMPORT  feraiseexcept
_fp_trapveneer
        ; This would be a bit backwards for some people, but it works for us...
        ; we know the relevant traps are enabled, so feraiseexcept won't
        ; return. Honest...
        MOV     a1,ip,LSR #26
        AND     a1,a1,#&1F
        B       feraiseexcept

        END