Commit 280c51dc authored by Ben Avison

Created cleanroom implementation of 64-bit integer support for licensing...

Created cleanroom implementation of 64-bit integer support for licensing reasons. Tested fairly thoroughly (in fact, this testing showed up a bug in the old code which unbalanced the stack for 50% of cases where you divided a negative long long by 10). Hopefully it should work faster than the old code too!

Reapplied all relevant old tags.

Real commit date 2008-03-18.
parent 6bd573f9
; Copyright 2008 Castle Technology Ltd
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
; http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
GET objmacs.s
CodeArea
EXPORT _ll_from_u
EXPORT _ll_from_l
EXPORT _ll_to_l
EXPORT _ll_add
EXPORT _ll_addlu
EXPORT _ll_addls
EXPORT _ll_adduu
EXPORT _ll_addss
EXPORT _ll_sub
EXPORT _ll_sublu
EXPORT _ll_subls
EXPORT _ll_subuu
EXPORT _ll_subss
EXPORT _ll_rsb
EXPORT _ll_rsblu
EXPORT _ll_rsbls
EXPORT _ll_rsbuu
EXPORT _ll_rsbss
EXPORT _ll_mul
EXPORT _ll_mullu
EXPORT _ll_mulls
EXPORT _ll_muluu
EXPORT _ll_mulss
EXPORT _ll_udiv
EXPORT _ll_urdv
EXPORT _ll_udiv10
EXPORT _ll_sdiv
EXPORT _ll_srdv
EXPORT _ll_sdiv10
EXPORT _ll_not
EXPORT _ll_neg
EXPORT _ll_and
EXPORT _ll_or
EXPORT _ll_eor
EXPORT _ll_shift_l
EXPORT _ll_ushift_r
EXPORT _ll_sshift_r
EXPORT _ll_cmpu
EXPORT _ll_cmpge
EXPORT _ll_cmple
IMPORT __rt_div0
IMPORT __rt_udiv
GBLL HaveCLZ
HaveCLZ SETL {FALSE}
XOS_EnterOS * &16
XOS_LeaveOS * &7C
CPUArch_pre_v4 * 0
CPUArch_v4 * 1
CPUArch_v4T * 2
CPUArch_v5 * 3
CPUArch_v5T * 4
CPUArch_v5TE * 5
CPUArch_v5TEJ * 6
CPUArch_v6 * 7
cp15 CP 15
c0 CN 0
; Routine to determine the CPU architecture
; (needs to be called from init somewhere)
; In: v6 = static base, USR mode
ReadCPUArch
Push "a1,lr"
SWI XOS_EnterOS
MRC cp15, 0, lr, c0, c0, 0
ANDS a1, lr, #&F000
MOVEQ lr, #0 ; 0 if pre-ARM7
TEQNE a1, #&7000
MOVEQ a1, lr, LSR #22
ANDEQ a1, a1, #2 ; ARM7 may be v3 or v4T
MOVNE a1, lr, LSR #16
ANDNE a1, a1, #&F ; post-ARM7 may be v4 onwards
; STR a1, [v6, #O__architecture]
SWI XOS_LeaveOS
[ {CONFIG}=26
TEQVSP pc, #0
|
MSRVS CPSR_c, #0
]
Pop "a1,pc"
; CPUArch
; Determine the architecture of the CPU
; Ideally this should be cached in static workspace
; but that can't be done as a retrofit to old versions!
; $r: output register to hold one of the constants above
; $tmp: scratch register
MACRO
$l CPUArch $r, $tmp
[ 1=1
MOV $r, #CPUArch_pre_v4 ; eg 7500FE
|
[ 1=1
MOV $r, #CPUArch_v4 ; eg StrongARM
|
[ 1=1
MOV $r, #CPUArch_v5TE ; eg XScale
|
LoadStaticBase $r, $tmp
LDR $r, [$r, #O__architecture]
]
]
]
MEND
; Convert uint32_t to uint64_t or int64_t
; In: a1
; Out: (a1,a2)
_ll_from_u
MOV a2, #0
Return ,, LinkNotStacked
; Convert int32_t to int64_t or uint64_t
; In: a1
; Out: (a1,a2)
_ll_from_l
MOV a2, a1, ASR #31
Return ,, LinkNotStacked
; Convert int64_t or uint64_t to int32_t or uint32_t
; In: (a1,a2)
; Out: a1
_ll_to_l
Return ,, LinkNotStacked
; Add two 64-bit numbers
; In: (a1,a2),(a3,a4)
; Out: (a1,a2)
_ll_add
ADDS a1, a1, a3
ADC a2, a2, a4
Return ,, LinkNotStacked
; Add a uint32_t to a 64-bit number
; In: (a1,a2),a3
; Out: (a1,a2)
_ll_addlu
ADDS a1, a1, a3
ADC a2, a2, #0
Return ,, LinkNotStacked
; Add an int32_t to a 64-bit number
; In: (a1,a2),a3
; Out: (a1,a2)
_ll_addls
ADDS a1, a1, a3
ADC a2, a2, a3, ASR #31
Return ,, LinkNotStacked
; Create a 64-bit number by adding two uint32_t numbers
; In: a1,a2
; Out: (a1,a2)
_ll_adduu
ADDS a1, a1, a2
MOVCC a2, #0
MOVCS a2, #1
Return ,, LinkNotStacked
; Create a 64-bit number by adding two int32_t numbers
; In: a1,a2
; Out: (a1,a2)
_ll_addss
MOV ip, a1, ASR #31
ADDS a1, a1, a2
ADC a2, ip, a2, ASR #31
Return ,, LinkNotStacked
; Subtract two 64-bit numbers
; In: (a1,a2),(a3,a4)
; Out: (a1,a2)
_ll_sub
SUBS a1, a1, a3
SBC a2, a2, a4
Return ,, LinkNotStacked
; Subtract a uint32_t from a 64-bit number
; In: (a1,a2),a3
; Out: (a1,a2)
_ll_sublu
SUBS a1, a1, a3
SBC a2, a2, #0
Return ,, LinkNotStacked
; Subtract an int32_t from a 64-bit number
; In: (a1,a2),a3
; Out: (a1,a2)
_ll_subls
SUBS a1, a1, a3
SBC a2, a2, a3, ASR #31
Return ,, LinkNotStacked
; Create a 64-bit number by subtracting two uint32_t numbers
; In: a1,a2
; Out: (a1,a2)
_ll_subuu
SUBS a1, a1, a2
MOVCC a2, #-1
MOVCS a2, #0 ; carry = not borrow
Return ,, LinkNotStacked
; Create a 64-bit number by subtracting two int32_t numbers
; In: a1,a2
; Out: (a1,a2)
_ll_subss
MOV ip, a1, ASR #31
SUBS a1, a1, a2
SBC a2, ip, a2, ASR #31
Return ,, LinkNotStacked
; Reverse-subtract two 64-bit numbers
; In: (a1,a2),(a3,a4)
; Out: (a1,a2)
_ll_rsb
RSBS a1, a1, a3
RSC a2, a2, a4
Return ,, LinkNotStacked
; Subtract a 64-bit number from a uint32_t
; In: (a1,a2),a3
; Out: (a1,a2)
_ll_rsblu
RSBS a1, a1, a3
RSC a2, a2, #0
Return ,, LinkNotStacked
; Subtract a 64-bit number from an int32_t
; In: (a1,a2),a3
; Out: (a1,a2)
_ll_rsbls
RSBS a1, a1, a3
RSC a2, a2, a3, ASR #31
Return ,, LinkNotStacked
; Create a 64-bit number by reverse-subtracting two uint32_t numbers
; In: a1,a2
; Out: (a1,a2)
_ll_rsbuu
RSBS a1, a1, a2
MOVCC a2, #-1
MOVCS a2, #0 ; carry = not borrow
Return ,, LinkNotStacked
; Create a 64-bit number by reverse-subtracting two int32_t numbers
; In: a1,a2
; Out: (a1,a2)
_ll_rsbss
MOV ip, a1, ASR #31
RSBS a1, a1, a2
RSC a2, ip, a2, ASR #31
Return ,, LinkNotStacked
; Multiply two 64-bit numbers
; In: (a1,a2),(a3,a4)
; Out: (a1,a2)
_ll_mul
FunctionEntry
CPUArch ip, lr
CMP ip, #CPUArch_v4
BCC mul_hardway
; Have UMULL instruction
MOV ip, a1
UMULL a1, lr, a3, a1
MLA lr, ip, a4, lr
MLA a2, a3, a2, lr
Return
mul_hardway
; No UMULL instruction
; Break the operation down thus:
; aaaaaaaa bbbb cccc
; * dddddddd eeee ffff
; ------------------
; cccc * ffff
; bbbb * ffff
; cccc * eeee
; bbbb * eeee
; aaaaaaaa * eeeeffff
; + dddddddd * bbbbcccc
MUL a2, a3, a2 ; msw starts as aaaaaaaa * eeeeffff
MLA a2, a4, a1, a2 ; msw += dddddddd * bbbbcccc
MOV lr, a3, LSR #16 ; lr = eeee from now on
MOV ip, a1, LSR #16 ; ip = bbbb from now on
SUB a4, a3, lr, LSL #16 ; a4 = ffff
SUB a3, a1, ip, LSL #16 ; a3 = cccc
MUL a1, a3, a4 ; lsw starts as cccc * ffff
MUL a4, ip, a4
MUL a3, lr, a3
ADDS a3, a4, a3 ; a3 = (bbbb * ffff + cccc * eeee) [0:31]
MOV a4, a3, RRX ; a4 = (bbbb * ffff + cccc * eeee) [1:32]
ADDS a1, a1, a3, LSL #16 ; lsw now complete
ADC a2, a2, a4, LSR #15
MLA a2, ip, lr, a2 ; msw completed by adding bbbb * eeee
Return
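; The 16-bit decomposition above, as an illustrative C sketch using only
; 32x32->32 multiplies (not the register-level code; the names are ad hoc):
;
;   void mul64(uint32_t xl, uint32_t xh, uint32_t yl, uint32_t yh,
;              uint32_t *lo, uint32_t *hi)
;   {
;       uint32_t h = xh * yl + yh * xl;          /* cross terms land in the top word      */
;       uint32_t a = xl >> 16, b = xl & 0xFFFF;  /* split low words into 16-bit halves    */
;       uint32_t c = yl >> 16, d = yl & 0xFFFF;
;       uint32_t l = b * d;
;       uint32_t s = a * d + b * c;              /* middle terms; the sum may carry out...*/
;       uint32_t cy = (s < a * d);               /* ...so recover the lost carry          */
;       uint32_t l2 = l + (s << 16);
;       h += (s >> 16) + (cy << 16) + (l2 < l) + a * c;
;       *lo = l2; *hi = h;
;   }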
; Multiply a 64-bit number by a uint32_t
; In: (a1,a2),a3
; Out: (a1,a2)
_ll_mullu
FunctionEntry
CPUArch ip, lr
CMP ip, #CPUArch_v4
MOVCC a4, #0
BCC mul_hardway
; Have UMULL instruction
UMULL a1, lr, a3, a1
MLA a2, a3, a2, lr
Return
; Multiply a 64-bit number by an int32_t
; In: (a1,a2),a3
; Out: (a1,a2)
_ll_mulls
MOV a4, a3, ASR #31
B _ll_mul
; Create a 64-bit number by multiplying two uint32_t numbers
; In: a1,a2
; Out: (a1,a2)
_ll_muluu
FunctionEntry
CPUArch ip, lr
CMP ip, #CPUArch_v4
BCC %FT50
; Have UMULL instruction
MOV lr, a1
UMULL a1, a2, lr, a2
Return
50 ; No UMULL instruction
MOV a3, a2
MOV a2, #0
MOV a4, #0
B mul_hardway
; Create a 64-bit number by multiplying two int32_t numbers
; In: a1,a2
; Out: (a1,a2)
_ll_mulss
FunctionEntry
CPUArch ip, lr
CMP ip, #CPUArch_v4
BCC %FT50
; Have SMULL instruction
MOV lr, a1
SMULL a1, a2, lr, a2
Return
50 ; No SMULL instruction
MOV a3, a2
MOV a2, a1, ASR #31
MOV a4, a3, ASR #31
B mul_hardway
; Emulate CLZ instruction for architectures that lack it
; Pinched from AsmUtils
soft_clz
ORRS a4, a1, a1, LSR #1
MOVEQ a1, #32
ORRNE a1, a4, a4, LSR #2
Return ,, LinkNotStacked, EQ
ORR a1, a1, a1, LSR #4
LDR a2, =&06C9C57D
ORR a1, a1, a1, LSR #8
ADR a3, clz_table
ORR a1, a1, a1, LSR #16
MLAS a1, a2, a1, a2
LDRNEB a1, [a3, a1, LSR #27]
Return ,, LinkNotStacked
clz_table
= 32, 31, 14, 30, 22, 13, 29, 19, 2, 21, 12, 10, 25, 28, 18, 8
= 1, 15, 23, 20, 3, 11, 26, 9, 16, 24, 4, 27, 17, 5, 6, 7
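; For reference, the same table-driven count-leading-zeros in C (a sketch
; of the technique above, not part of the build; clz_table is the byte
; table just defined):
;
;   unsigned soft_clz(uint32_t x)
;   {
;       if (x == 0) return 32;
;       x |= x >> 1;  x |= x >> 2;  x |= x >> 4;   /* smear the top set bit down     */
;       x |= x >> 8;  x |= x >> 16;                /* x is now 2^(32-clz) - 1        */
;       uint32_t m = 0x06C9C57Du * (x + 1);        /* top 5 bits index the table     */
;       return m ? clz_table[m >> 27] : 0;         /* m == 0 only for x = 0xFFFFFFFF */
;   }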
; Divide a uint64_t by another, returning both quotient and remainder
; In: dividend (a1,a2), divisor (a3,a4)
; Out: quotient (a1,a2), remainder (a3,a4)
_ll_udiv
FunctionEntry , "a1-v6,sl,fp"
; Register usage:
; v1,v2 = quotient (initially 0)
; v3,v4 = remainder (initially dividend)
; v5,v6 = divisor
; sl = CPU architecture
; fp used as a scratch register
; note none of our callees use sl or fp in their usual sense
Pop "v3-v6"
_ll_udiv_lateentry
MOV v1, #0
MOV v2, #0
; Calculate a floating point underestimate of the
; reciprocal of the divisor. The representation used is
; mantissa: 16 bits
; exponent: number of binary places below integers of lsb of mantissa
; The way the mantissa and exponent are calculated
; depends upon the number of leading zeros in the divisor.
CPUArch sl, lr
CMP sl, #CPUArch_v5
CLZCS a1, v6
MOVCC a1, v6
BLCC soft_clz
MOV fp, a1 ; fp = leading zeros in divisor
CMP fp, #16
BCS %FT10
; Divisor has 0..15 leading zeros.
MOV a2, v6, LSL fp
MOVS a1, v5
MOVEQS a1, a2, LSL #16
MOVNE a1, #1 ; round up to account for loss of accuracy
ADD a1, a1, a2, LSR #16 ; divisor for calculating mantissa
B %FT40
10 CMP v6, #0
BEQ %FT20
; Divisor has 16..31 leading zeros.
SUB a2, fp, #16
RSB a3, fp, #48
MOVS a1, v5, LSL a2
MOVNE a1, #1 ; round up to account for loss of accuracy
ADD a1, a1, v6, LSL a2
ADD a1, a1, v5, LSR a3 ; divisor for calculating mantissa
B %FT40
20 CMP sl, #CPUArch_v5
CLZCS a1, v5
MOVCC a1, v5
BLCC soft_clz
ADD fp, a1, #32 ; fp = leading zeros in divisor
CMP fp, #48
BCS %FT30
; Divisor has 32..47 leading zeros.
MOV a2, v5, LSL a1
MOVS a1, a2, LSL #16
MOVNE a1, #1 ; round up to account for loss of accuracy
ADD a1, a1, a2, LSR #16 ; divisor for calculating mantissa
B %FT40
30 CMP v5, #0
BEQ %FT99
; Divisor has 48..63 leading zeros.
SUB a2, a1, #16
MOV a1, v5, LSL a2 ; divisor for calculating mantissa
; drop through
40 MOV a2, #&80000000 ; dividend for calculating mantissa
BL __rt_udiv ; a1 = mantissa &8000..&10000
RSB a2, fp, #15+64 ; a2 = exponent
TST a1, #&10000
MOVNE a1, #&8000 ; force any &10000 mantissas into 16 bits
SUBNE a2, a2, #1
50 ; Main iteration loop:
; each time round loop, calculate a close underestimate of
; the quotient by multiplying through the "remainder" by the
; approximate reciprocal of the divisor.
; a1 = mantissa
; a2 = exponent
; Perform 16 (a1) * 64 (v3,v4) -> 80 (a3,a4,lr) multiply
CMP sl, #CPUArch_v4
BCC %FT51
; Have UMULL instruction
UMULL a3, ip, v3, a1
UMULL a4, lr, v4, a1
ADDS a4, ip, a4
ADC lr, lr, #0
B %FT60
51 ; No UMULL instruction
; aaaa bbbb cccc dddd
; * eeee
; -------------------
; dddd * eeee
; cccc * eeee
; bbbb * eeee
; aaaa * eeee
MOV ip, v4, LSR #16
MOV fp, v3, LSR #16
SUB a4, v4, ip, LSL #16
SUB a3, v3, fp, LSL #16
MUL ip, a1, ip
MUL fp, a1, fp
MUL a4, a1, a4
MUL a3, a1, a3
MOV lr, ip, LSR #16
MOV ip, ip, LSL #16
ORR ip, ip, fp, LSR #16
MOV fp, fp, LSL #16
ADDS a3, a3, fp
ADCS a4, a4, ip
ADC lr, lr, #0
60 ; Shift down by exponent
; First a word at a time, if necessary:
SUBS ip, a2, #32
BCC %FT62
61 MOV a3, a4
MOV a4, lr
MOV lr, #0
SUBS ip, ip, #32
BCS %BT61
62 ; Then by bits, if necessary:
ADDS ip, ip, #32
BEQ %FT70
RSB fp, ip, #32
MOV a3, a3, LSR ip
ORR a3, a3, a4, LSL fp
MOV a4, a4, LSR ip
ORR a4, a4, lr, LSL fp
70 ; Now (a3,a4) contains an underestimate of the quotient.
; Add it to the running total for the quotient, then
; multiply through by divisor and subtract from the remainder.
; Sometimes (a3,a4) = 0, in which case this step can be skipped.
ORRS lr, a3, a4
BEQ %FT80
ADDS v1, v1, a3
ADC v2, v2, a4
CMP sl, #CPUArch_v4
MOVCS lr, a3
UMULLCS a3, ip, v5, lr
MLACS a4, v5, a4, ip
MLACS a4, v6, lr, a4
BCS %FT75
; No UMULL instruction
; Proceed as for mul_hardway
MUL a4, v5, a4
MLA a4, v6, a3, a4
MOV ip, a3, LSR #16
MOV lr, v5, LSR #16
SUB fp, a3, ip, LSL #16
SUB lr, v5, lr, LSL #16
MUL a3, fp, lr
Push "ip"
MUL ip, lr, ip
MOV lr, v5, LSR #16
MUL fp, lr, fp
ADDS fp, ip, fp
MOV ip, fp, RRX
ADDS a3, a3, fp, LSL #16
ADC a4, a4, ip, LSR #15
Pop "ip"
MLA a4, ip, lr, a4
75 SUBS v3, v3, a3
SBC v4, v4, a4
80 ; Termination condition for iteration loop is
; remainder < divisor
; OR
; quotient increment == 0
CMP v3, v5
SBCS lr, v4, v6
TEQCC lr, lr ; set Z if r < d (and preserve C)
ORRCSS lr, a3, a4 ; else Z = a3 and a4 both 0
BNE %BT50
; The final multiple of the divisor can get lost in rounding
; so subtract one more divisor if necessary
CMP v3, v5
SBCS lr, v4, v6
BCC %FT85
ADDS v1, v1, #1
ADC v2, v2, #0
SUBS v3, v3, v5
SBC v4, v4, v6
85
Push "v1-v4"
Return , "a1-v6,sl,fp"
99 ; Division by zero
Pop "v1-v6,sl,fp,lr"
B __rt_div0
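; The overall iteration, modelled in C for clarity (illustrative only:
; mant/exp stand for the 16-bit reciprocal underestimate computed above,
; with mant * 2^-exp <= 1/divisor and &8000 <= mant <= &FFFF):
;
;   uint64_t udiv_model(uint64_t dividend, uint64_t divisor,
;                       uint32_t mant, unsigned exp, uint64_t *rem_out)
;   {
;       uint64_t q = 0, rem = dividend;
;       for (;;) {
;           /* 80-bit product rem * mant, shifted down by exp, in two halves */
;           uint64_t plo = (rem & 0xFFFFFFFFu) * mant;
;           uint64_t phi = (rem >> 32) * mant + (plo >> 32);
;           uint64_t inc = (exp >= 32) ? phi >> (exp - 32)
;                                      : (phi << (32 - exp)) | ((plo & 0xFFFFFFFFu) >> exp);
;           if (rem < divisor || inc == 0) break;    /* same termination test as above  */
;           q += inc;                                /* inc underestimates rem/divisor  */
;           rem -= inc * divisor;
;       }
;       if (rem >= divisor) { q++; rem -= divisor; } /* final fix-up, as above          */
;       *rem_out = rem;
;       return q;
;   }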
; Divide a uint64_t by another, returning both quotient and remainder
; In: divisor (a1,a2), dividend (a3,a4)
; Out: quotient (a1,a2), remainder (a3,a4)
_ll_urdv
FunctionEntry , "a1-v6,sl,fp"
Pop "v5,v6"
Pop "v3,v4"
B _ll_udiv_lateentry
; Divide a uint64_t by 10, returning both quotient and remainder
; In: (a1,a2)
; Out: quotient (a1,a2), remainder (a3,a4)
_ll_udiv10
Push "a1"
CPUArch ip, a4
CMP ip, #CPUArch_v4
BCC %FT50
; Have UMULL instruction
; Multiply by 0.6 (= &0.999 recurring)
; and subtract multiplication by 0.5 (LSR #1).
; Ignore fractional parts for now.
LDR ip, =&99999999
UMULL a4, a3, a1, ip
UMULL a4, ip, a2, ip
MOVS a2, a2, LSR #1
MOVS a1, a1, RRX
ADCS a1, a1, #0
ADC a2, a2, #0
SUBS a1, a4, a1
SBC a2, ip, a2
ADDS a1, a1, ip
ADC a2, a2, #0
ADDS a1, a1, a3
ADC a2, a2, #0
; It can be shown mathematically that this is an underestimate
; of the true quotient by up to 2.5. Compensate by detecting
; over-large remainders.
40 MOV ip, #10
MUL a3, a1, ip ; quotient * 10 (MSW is unimportant)
Pop "a4"
SUB a3, a4, a3 ; remainder between 0 and 25
; Bring the remainder back within range.
; For a number x <= 68, x / 10 == (x * 13) >> 7
MOV a4, #13
MUL a4, a3, a4
MOV a4, a4, LSR #7
ADDS a1, a1, a4
ADC a2, a2, #0
MUL a4, ip, a4
SUB a3, a3, a4
MOV a4, #0
Return ,, LinkNotStacked
50 ; No UMULL instruction
; Multiply by 0.6 (= &0.999 recurring)
; and subtract multiplication by 0.5 (LSR #1).
; Ignore fractional parts for now.
Push "v1,lr"
LDR lr, =&9999
MOV ip, a2, LSR #16 ; MS halfword
SUB v1, a2, ip, LSL #16
MOV a4, a1, LSR #16
SUB a3, a1, a4, LSL #16 ; LS halfword
MUL a3, lr, a3 ; multiply through by &9999
MUL a4, lr, a4
MUL v1, lr, v1
MUL ip, lr, ip
MOVS a2, a2, LSR #1 ; find half the dividend
MOVS a1, a1, RRX
ADCS a1, a1, #0 ; round upwards
ADC a2, a2, #0
ADD a4, a4, a4, LSR #16 ; can't unsigned overflow
ADD a4, a4, a3, LSR #16 ; can't unsigned overflow
SUBS a1, a4, a1
SBC a2, ip, a2
ADDS a1, a1, v1
ADC a2, a2, #0
ADDS a1, a1, v1, ROR #16
ADC a2, a2, v1, LSR #16
ADDS a1, a1, ip
ADC a2, a2, #0
ADDS a1, a1, ip, ROR #16
ADC a2, a2, ip, LSR #16
; It can be shown mathematically that this is an underestimate
; of the true quotient by up to 4.5. Compensate by detecting
; over-large remainders.
Pop "v1,lr"
B %BT40
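; The remainder fix-up shared by both paths above (label 40), sketched in
; C: q0 is the underestimate of x/10, so the residual fits easily in 32
; bits and one more tiny division finishes the job (names are ad hoc):
;
;   void udiv10_fixup(uint64_t x, uint64_t q0, uint64_t *q, uint32_t *r)
;   {
;       uint32_t rem = (uint32_t)(x - q0 * 10);  /* between 0 and about 25      */
;       uint32_t adj = (rem * 13) >> 7;          /* == rem / 10 for rem <= 68   */
;       *q = q0 + adj;
;       *r = rem - adj * 10;
;   }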
; Divide an int64_t by another, returning both quotient and remainder
; In: dividend (a1,a2), divisor (a3,a4)
; Out: quotient (a1,a2), remainder (a3,a4)
; Remainder has same sign as dividend - required by C99, although
; earlier versions of C allowed the sign to match the divisor
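; (For example, -7 divided by 2 gives quotient -3 and remainder -1 here,
; rather than quotient -4 and remainder +1.)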
_ll_sdiv
FunctionEntry , "v1"
MOVS v1, a4, LSR #31
BEQ %FT10
; Find absolute divisor
RSBS a3, a3, #0
RSC a4, a4, #0
10 EORS v1, v1, a2, ASR #31
BPL %FT20
; Find absolute dividend
RSBS a1, a1, #0
RSC a2, a2, #0
20 BL _ll_udiv
TEQ v1, #0
BPL %FT30
; Remainder is negative (sign(dividend) == -1)
RSBS a3, a3, #0
RSC a4, a4, #0
30 TST v1, #1
BEQ %FT40
; Quotient is negative (sign(divisor) != sign(dividend))
RSBS a1, a1, #0
RSC a2, a2, #0
40
Return , "v1"
; Divide an int64_t by another, returning both quotient and remainder
; In: divisor (a1,a2), dividend (a3,a4)
; Out: quotient (a1,a2), remainder (a3,a4)
; Remainder has same sign as dividend - required by C99, although
; earlier versions of C allowed the sign to match the divisor
_ll_srdv
FunctionEntry , "v1"
MOVS v1, a2, LSR #31
BEQ %FT10
; Find absolute divisor
RSBS a1, a1, #0
RSC a2, a2, #0
10 EORS v1, v1, a4, ASR #31
BPL %FT20
; Find absolute dividend
RSBS a3, a3, #0
RSC a4, a4, #0
20 BL _ll_urdv
TEQ v1, #0
BPL %FT30
; Remainder is negative (sign(dividend) == -1)
RSBS a3, a3, #0
RSC a4, a4, #0
30 TST v1, #1
BEQ %FT40
; Quotient is negative (sign(divisor) != sign(dividend))
RSBS a1, a1, #0
RSC a2, a2, #0
40
Return , "v1"
; Divide an int64_t by 10, returning both quotient and remainder
; Remainder has same sign as dividend - required by C99, although
; earlier versions of C allowed the sign to match the divisor
; In: (a1,a2)
; Out: quotient (a1,a2), remainder (a3,a4)
_ll_sdiv10
FunctionEntry , "v1"
MOVS v1, a2
BPL %FT10
RSBS a1, a1, #0 ; find abs(dividend)
RSC a2, a2, #0
10 BL _ll_udiv10
TEQ v1, #0
Return , "v1",, PL
RSBS a1, a1, #0
RSC a2, a2, #0
RSBS a3, a3, #0
RSC a4, a4, #0
Return , "v1"
; Find the bitwise NOT of a 64-bit number
; In: (a1,a2)
; Out: (a1,a2)
_ll_not
MVN a1, a1
MVN a2, a2
Return ,, LinkNotStacked
; Find the negative of a 64-bit number
; In: (a1,a2)
; Out: (a1,a2)
_ll_neg
RSBS a1, a1, #0
RSC a2, a2, #0
Return ,, LinkNotStacked
; Find the bitwise AND of two 64-bit numbers
; In: (a1,a2),(a3,a4)
; Out: (a1,a2)
_ll_and
AND a1, a1, a3
AND a2, a2, a4
Return ,, LinkNotStacked
; Find the bitwise OR of two 64-bit numbers
; In: (a1,a2),(a3,a4)
; Out: (a1,a2)
_ll_or
ORR a1, a1, a3
ORR a2, a2, a4
Return ,, LinkNotStacked
; Find the bitwise exclusive OR of two 64-bit numbers
; In: (a1,a2),(a3,a4)
; Out: (a1,a2)
_ll_eor
EOR a1, a1, a3
EOR a2, a2, a4
Return ,, LinkNotStacked
; Shift a 64-bit number left
; In: (a1,a2),a3
; Out: (a1,a2)
_ll_shift_l
RSBS ip, a3, #32
MOVHI a2, a2, LSL a3
ORRHI a2, a2, a1, LSR ip
MOVHI a1, a1, LSL a3
Return ,, LinkNotStacked, HI
SUB ip, a3, #32
MOV a2, a1, LSL ip
MOV a1, #0
Return ,, LinkNotStacked
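; Equivalent C for the left shift (a sketch, assuming 0 <= n < 64; ARM
; register-specified shifts by 32 yield 0, which is why the code above
; needs no special case for n = 0, whereas C needs the guard shown here):
;
;   void shl64(uint32_t lo, uint32_t hi, unsigned n, uint32_t *rlo, uint32_t *rhi)
;   {
;       if (n < 32) {
;           *rhi = (hi << n) | (n ? lo >> (32 - n) : 0);
;           *rlo = lo << n;
;       } else {
;           *rhi = lo << (n - 32);
;           *rlo = 0;
;       }
;   }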
; Logical-shift a 64-bit number right
; In: (a1,a2),a3
; Out: (a1,a2)
_ll_ushift_r
RSBS ip, a3, #32
MOVHI a1, a1, LSR a3
ORRHI a1, a1, a2, LSL ip
MOVHI a2, a2, LSR a3
Return ,, LinkNotStacked, HI
SUB ip, a3, #32
MOV a1, a2, LSR ip
MOV a2, #0
Return ,, LinkNotStacked
; Arithmetic-shift a 64-bit number right
; In: (a1,a2),a3
; Out: (a1,a2)
_ll_sshift_r
RSBS ip, a3, #32
MOVHI a1, a1, LSR a3
ORRHI a1, a1, a2, LSL ip
MOVHI a2, a2, ASR a3
Return ,, LinkNotStacked, HI
SUB ip, a3, #32
MOV a1, a2, ASR ip
MOV a2, a1, ASR #31
Return ,, LinkNotStacked
; Compare two uint64_t numbers, or test two int64_t numbers for equality
; In: (a1,a2),(a3,a4)
; Out: Z set if equal, Z clear if different
; C set if unsigned higher or same, C clear if unsigned lower
; all registers preserved
_ll_cmpu
CMP a2, a4
CMPEQ a1, a3
MOV pc, lr ; irrespective of calling standard
; Compare two int64_t numbers for testing GE or LT
; In: (a1,a2),(a3,a4)
; Out: N == V if signed greater than or equal, N != V if signed less than
; a1, a2 corrupted
_ll_cmpge
SUBS a1, a1, a3
SBCS a2, a2, a4
MOV pc, lr ; irrespective of calling standard
; Compare two int64_t numbers for testing LE or GT
; In: (a1,a2),(a3,a4)
; Out: N == V if signed less than or equal, N != V if signed greater than
; (ie subsequent instructions need to use GE/LT condition instead of LE/GT)
; a1, a2 corrupted
_ll_cmple
SUBS a1, a3, a1
SBCS a2, a4, a2
MOV pc, lr ; irrespective of calling standard
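; Typical use of the two comparison helpers (illustrative): a caller
; evaluating "x <= y" on int64_t values would do
;       BL      _ll_cmple
;       BGE     taken_when_x_le_y       ; GE/LT used in place of LE/GT, as noted above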
; Now the floating point functions...
EXPORT _ll_uto_d
EXPORT _ll_sto_d
EXPORT _ll_uto_f
EXPORT _ll_sto_f
EXPORT _ll_ufrom_d
EXPORT _ll_sfrom_d
EXPORT _ll_ufrom_f
EXPORT _ll_sfrom_f
EXPORT llrint
EXPORT llrintf
EXPORT llround
EXPORT llroundf
; bit 31 rounding direction
; bits 30..26 exceptions (30=INX,29=UFL,28=OFL,27=DVZ,26=IVO)
; bit 24 flush to zero
; bits 23..22 rounding mode
; bit 18 "round" version of to-nearest (halfway case round away from zero)
; bit 17 rounded convert (as opposed to towards zero)
; bit 16 attempt to convert NaN
; bits 9..7 in type
; bits 6..4 out type
; bits 3..0 function
FE_EX_RDIR * &80000000
FE_EX_EXCEPT_MASK * &7C000000
FE_EX_INEXACT * &40000000
FE_EX_UNDERFLOW * &20000000
FE_EX_OVERFLOW * &10000000
FE_EX_DIVBYZERO * &08000000
FE_EX_INVALID * &04000000
FE_EX_FLUSHZERO * &01000000
FE_EX_ROUND_MASK * &00C00000
FE_EX_CVT_RND * &00040000
FE_EX_CVT_R * &00020000
FE_EX_CVT_NAN * &00010000
FE_EX_INTYPE_MASK * &00000380
FE_EX_OUTTYPE_MASK * &00000070
FE_EX_TYPE_MASK * &00000070
FE_EX_FN_MASK * &0000000F
FE_EX_ROUND_NEAREST * &00000000
FE_EX_ROUND_PLUSINF * &00400000
FE_EX_ROUND_MINUSINF * &00800000
FE_EX_ROUND_ZERO * &00C00000
FE_EX_BASETYPE_FLOAT * 0
FE_EX_BASETYPE_DOUBLE * 1
FE_EX_BASETYPE_UNSIGNED * 2
FE_EX_BASETYPE_INT * 4
FE_EX_BASETYPE_LONGLONG * FE_EX_BASETYPE_INT+FE_EX_BASETYPE_DOUBLE
FE_EX_BASETYPE_UINT * FE_EX_BASETYPE_INT+FE_EX_BASETYPE_UNSIGNED
FE_EX_BASETYPE_ULONGLONG * FE_EX_BASETYPE_LONGLONG+FE_EX_BASETYPE_UNSIGNED
FE_EX_TYPE_FLOAT * FE_EX_BASETYPE_FLOAT :SHL: 4
FE_EX_TYPE_DOUBLE * FE_EX_BASETYPE_DOUBLE :SHL: 4
FE_EX_TYPE_INT * FE_EX_BASETYPE_INT :SHL: 4
FE_EX_TYPE_LONGLONG * FE_EX_BASETYPE_LONGLONG :SHL: 4
FE_EX_TYPE_UINT * FE_EX_BASETYPE_UINT :SHL: 4
FE_EX_TYPE_ULONGLONG * FE_EX_BASETYPE_ULONGLONG :SHL: 4
FE_EX_INTYPE_FLOAT * FE_EX_BASETYPE_FLOAT :SHL: 7
FE_EX_INTYPE_DOUBLE * FE_EX_BASETYPE_DOUBLE :SHL: 7
FE_EX_INTYPE_INT * FE_EX_BASETYPE_INT :SHL: 7
FE_EX_INTYPE_LONGLONG * FE_EX_BASETYPE_LONGLONG :SHL: 7
FE_EX_INTYPE_UINT * FE_EX_BASETYPE_UINT :SHL: 7
FE_EX_INTYPE_ULONGLONG * FE_EX_BASETYPE_ULONGLONG :SHL: 7
FE_EX_OUTTYPE_FLOAT * FE_EX_BASETYPE_FLOAT :SHL: 4
FE_EX_OUTTYPE_DOUBLE * FE_EX_BASETYPE_DOUBLE :SHL: 4
FE_EX_OUTTYPE_UNSIGNED * FE_EX_BASETYPE_UNSIGNED :SHL: 4
FE_EX_OUTTYPE_INT * FE_EX_BASETYPE_INT :SHL: 4
FE_EX_OUTTYPE_LONGLONG * FE_EX_BASETYPE_LONGLONG :SHL: 4
FE_EX_OUTTYPE_UINT * FE_EX_BASETYPE_UINT :SHL: 4
FE_EX_OUTTYPE_ULONGLONG * FE_EX_BASETYPE_ULONGLONG :SHL: 4
FE_EX_FN_ADD * 1
FE_EX_FN_SUB * 2
FE_EX_FN_MUL * 3
FE_EX_FN_DIV * 4
FE_EX_FN_REM * 5
FE_EX_FN_RND * 6
FE_EX_FN_SQRT * 7
FE_EX_FN_CVT * 8
FE_EX_FN_CMP * 9
FE_EX_FN_RAISE * 15
_ll_uto_d
MOV a3,#&42000000
B dfltll_normalise
_ll_sto_d
ANDS a3,a2,#&80000000
BPL %FT10
RSBS a1,a1,#0
RSC a2,a2,#0
10 ORR a3,a3,#&42000000
dfltll_normalise
SUB a3,a3,#&00300000
MOVS a4,a2
MOVNE a4,#32
MOVEQS a2,a1
Return ,,LinkNotStacked,EQ
[ HaveCLZ
CLZ ip,a2
MOV a2,a2,LSL ip
SUB a4,a4,ip
|
MOVS ip,a2,LSR #16
SUBEQ a4,a4,#16
MOVEQS a2,a2,LSL #16
TST a2,#&FF000000
SUBEQ a4,a4,#8
MOVEQ a2,a2,LSL #8
TST a2,#&F0000000
SUBEQ a4,a4,#4
MOVEQ a2,a2,LSL #4
TST a2,#&C0000000
SUBEQ a4,a4,#2
MOVEQS a2,a2,LSL #2
MOVPL a2,a2,LSL #1
SUBPL a4,a4,#1
]
ADD a3,a3,a4,LSL #20
ORR ip,a2,a1,LSR a4
RSB a4,a4,#32
MOV a4,a1,LSL a4
MOVS a2,a4,LSL #21
MOVNE a2,#FE_EX_INEXACT
STMDB sp!,{a2,lr}
MOVS a2,a4,LSL #22
ANDEQ a2,a4,a4,LSR #1
MOVEQS a2,a2,LSR #11
MOV a2,a4,LSR #11
ADCS a2,a2,ip,LSL #21
ADC a1,a3,ip,LSR #11
MOVS a4,a4,LSL #22
LDMIA sp!,{ip,lr}
TST ip,#FE_EX_INEXACT
BNE __fpl_exception
Return ,,LinkNotStacked
_ll_uto_f
MOV a3,#&3F800000
B fltll_normalise
_ll_sto_f
ANDS a3,a2,#&80000000
BPL %FT10
RSBS a1,a1,#0
RSC a2,a2,#0
10 ORR a3,a3,#&3F800000
fltll_normalise
ADD a3,a3,#&0F000000
MOVS a4,a2
MOVNE a4,#32
MOVEQS a2,a1
Return ,,LinkNotStacked,EQ
[ HaveCLZ
CLZ ip,a2
MOV a2,a2,LSL ip
SUB a4,a4,ip
|
MOVS ip,a2,LSR #16
SUBEQ a4,a4,#16
MOVEQS a2,a2,LSL #16
TST a2,#&FF000000
SUBEQ a4,a4,#8
MOVEQ a2,a2,LSL #8
TST a2,#&F0000000
SUBEQ a4,a4,#4
MOVEQ a2,a2,LSL #4
TST a2,#&C0000000
SUBEQ a4,a4,#2
MOVEQS a2,a2,LSL #2
MOVPL a2,a2,LSL #1
SUBPL a4,a4,#1
]
ORR a2,a2,a1,LSR a4
ADD a3,a3,a4,LSL #23
RSB a4,a4,#32
MOVS ip,a1,LSL a4
ORRS ip,ip,a2,LSL #25
ADC a1,a3,a2,LSR #8
ADC ip,pc,#0
ORRNES ip,ip,#4,2
BICCS a1,a1,#1
MOVS ip,ip,LSL #30
BNE __fpl_exception
Return ,,LinkNotStacked
_ll_ufrom_d
MOVS a3,a1,ASR #20
MOV a4,a1,LSL #11
ORR a4,a4,a2,LSR #21
MOV ip,a2,LSL #11
ORRNE a4,a4,#&80000000
BMI ll_ufrom_d_neg
SUB a3,a3,#&4E
RSBS a3,a3,#&03F0
BLT ll_ufrom_d_ivo
CMP a3,#&50
MOVGE a3,#&50
MOV a2,a4,LSR a3
MOV a1,ip,LSR a3
RSBS a3,a3,#32
ORRHI a1,a1,a4,LSL a3
RSB a3,a3,#0
ORRLS a1,a1,a4,LSR a3
RSBS a3,a3,#0
MOVGE ip,ip,LSL a3
MOVLT ip,ip,LSR #1
ADDS a3,a3,#32
ORRGE ip,ip,a4,LSL a3
MOVGE a4,#0
CMP a4,#1
ORRCS ip,ip,#1
TST ip,ip
Return ,,LinkNotStacked,EQ
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_ULONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT
B __fpl_exception
ll_ufrom_d_neg
ADD a3,a1,#&40000000
CMN a3,#&00100000
BGE ll_ufrom_d_ivo
ORRS a3,a2,a1,LSL #1
MOV a1,#0
MOV a2,#0
LDRNE ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_ULONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT
BNE __fpl_exception
Return ,,LinkNotStacked
ll_ufrom_d_ivo
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_ULONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INVALID
MOV a4,a1,LSL #1
CMP a4,#&FFE00000
CMPEQ a2,#0
ORRHI ip,ip,#FE_EX_CVT_NAN
B __fpl_exception
LTORG
MACRO
$func DblRound $round
$func
MOVS a3,a1,ASR #20 ; a3 = exponent, and 21 sign bits
MOV a4,a1,LSL #11
ORR a4,a4,a2,LSR #21
MOV ip,a2,LSL #11 ; ip = low mantissa
ORRNE a4,a4,#&80000000 ; a4 = high mantissa, unit bit forced on if neg
BMI $func._neg
SUB a3,a3,#&4E
RSBS a3,a3,#&03F0 ; a3 = &43E-exp = shift to get b.p. at bottom
BLE $func._ivo ; (must shift right, to get bit 63 clear)
CMP a3,#80 ; clamp to a shift by 80
MOVGE a3,#80
MOV a2,a4,LSR a3 ; a2 & a1 = shifted
MOV a1,ip,LSR a3
RSBS a3,a3,#32
ORRHI a1,a1,a4,LSL a3
RSB a3,a3,#0
ORRLS a1,a1,a4,LSR a3
RSBS a3,a3,#0
MOVGE ip,ip,LSL a3
MOVLT ip,ip,LSR #1
ADDS a3,a3,#32
ORRGE ip,ip,a4,LSL a3
MOVGE a4,#0
CMP a4,#1
ORRCS ip,ip,#1
TST ip,ip
Return ,,LinkNotStacked,EQ
[ "$round" = "rint"
TEQ ip,#&80000000
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT+FE_EX_CVT_R
MVNEQS a4,a1,LSL #31
BMI __fpl_exception
ADDS a1,a1,#1 ; Can't overflow, as any argument >= 2^52
ADCS a2,a2,#0 ; is an integer, so won't get here
B __fpl_exception
]
[ "$round" = "round"
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT+FE_EX_CVT_RND
BPL __fpl_exception
ADDS a1,a1,#1
ADCS a2,a2,#0
B __fpl_exception
]
[ "$round" = ""
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT
B __fpl_exception
]
$func._neg
ADD a3,a3,#&03F0
CMN a3,#&0410
BICEQ a4,a4,#&80000000 ; clear sign bit if exponent = 0
RSBS a3,a3,#&2E
BLT $func._ivo
BEQ $func._minint
$func._neg_noovf
CMP a3,#&50
MOVGE a3,#&50
MOV a2,a4,LSR a3
MOV a1,ip,LSR a3
RSBS a3,a3,#32
ORRHI a1,a1,a4,LSL a3
RSB a3,a3,#0
ORRLS a1,a1,a4,LSR a3
RSBS a1,a1,#0
RSC a2,a2,#0
RSBS a3,a3,#0
MOVGE ip,ip,LSL a3
MOVLT ip,ip,LSR #1
ADDS a3,a3,#32
ORRGE ip,ip,a4,LSL a3
MOVGE a4,#0
CMP a4,#1
ORRCS ip,ip,#1
TST ip,ip
Return ,,LinkNotStacked,EQ
[ "$round" = "rint"
TEQ ip,#&80000000
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT+FE_EX_CVT_R
MVNEQS a4,a1,LSL #31
BMI __fpl_exception
SUBS a1,a1,#1
SBC a2,a2,#0
B __fpl_exception
]
[ "$round" = "round"
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT+FE_EX_CVT_RND
BPL __fpl_exception
SUBS a1,a1,#1
SBCS a2,a2,#0
B __fpl_exception
]
[ "$round" = ""
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INEXACT
B __fpl_exception
]
$func._minint
TEQ ip,#0
TEQEQ a4,#&80000000
BEQ $func._neg_noovf
$func._ivo
[ "$round" = "rint"
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INVALID+FE_EX_CVT_R
|
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_DOUBLE+FE_EX_INVALID
]
MOV a4,a1,LSL #1
CMP a4,#&FFE00000
CMPEQ a2,#0
ORRHI ip,ip,#FE_EX_CVT_NAN
B __fpl_exception
MEND
_ll_sfrom_d DblRound
llrint DblRound rint
llround DblRound round
LTORG
_ll_ufrom_f
MOVS a3,a1,ASR #23
MOV a4,a1,LSL #8
ORRNE a4,a4,#&80000000
BMI ll_ufrom_f_negative
RSBS a3,a3,#&BE
BCC ll_ufrom_f_ivo
MOV a2,a4,LSR a3
SUBS ip,a3,#32
MOVCS a1,a4,LSR ip
RSBCC ip,a3,#32
MOVCC a1,a4,LSL ip
Return ,,LinkNotStacked,CC
RSBS a3,a3,#&40
MOVPL a4,a4,LSL a3
MOVMI a4,a4,LSR #1
TST a4,a4
Return ,,LinkNotStacked,EQ
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_ULONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT
B __fpl_exception
ll_ufrom_f_negative
MOV ip,a1,LSL #1
CMP ip,#&7F000000
BCS ll_ufrom_f_ivo
MOV a1,#0
MOV a2,#0
CMP ip,#0
LDRNE ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_ULONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT
BNE __fpl_exception
Return ,,LinkNotStacked
ll_ufrom_f_ivo
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_ULONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INVALID
MOV a4,a1,LSL #1
CMP a4,#&FF000000
ORRHI ip,ip,#FE_EX_CVT_NAN
B __fpl_exception
LTORG
MACRO
$func FltRound $round
$func
MOVS a3,a1,ASR #23
MOV a4,a1,LSL #8
ORRNE a4,a4,#&80000000
BMI $func.negative
RSBS a3,a3,#&BE
BLS $func.ivo
MOV a2,a4,LSR a3
SUBS ip,a3,#32
MOVCS a1,a4,LSR ip
RSBCC ip,a3,#32
MOVCC a1,a4,LSL ip
Return ,,LinkNotStacked,CC
RSBS a3,a3,#64
MOVPL a4,a4,LSL a3
MOVMI a4,a4,LSR #1
TST a4,a4
Return ,,LinkNotStacked,EQ
[ "$round" = "rint"
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT+FE_EX_CVT_R
TEQ a4,#&80000000
MVNEQS a4,a1,LSL #31
ADDPL a1,a1,#1 ; Can't overflow, as any argument >= 2^23
; is an integer, so won't get here
]
[ "$round" = "round"
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT+FE_EX_CVT_RND
ADDMI a1,a1,#1
]
[ "$round" = ""
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT
]
B __fpl_exception
$func.negative
CMP a1,#&DF000000
BHI $func.ivo
ANDS a3,a3,#&FF
BICEQ a4,a4,#&80000000
RSB a3,a3,#&BE
MOV a2,a4,LSR a3
SUBS ip,a3,#32
MOVCS a1,a4,LSR ip
RSBCC ip,a3,#32
MOVCC a1,a4,LSL ip
RSBS a1,a1,#0
RSC a2,a2,#0
RSBS a3,a3,#&40
MOVPL a4,a4,LSL a3
MOVMI a4,a4,LSR #1
TST a4,a4
Return ,,LinkNotStacked,EQ
[ "$round" = "rint"
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT+FE_EX_CVT_R
TEQ a4,#&80000000
MVNEQS a4,a1,LSL #31
SUBPL a1,a1,#1 ; Can't overflow, as any argument >= 2^23
; is an integer, so won't get here
]
[ "$round" = "round"
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT+FE_EX_CVT_RND
SUBMI a1,a1,#1
]
[ "$round" = ""
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INEXACT
]
B __fpl_exception
$func.ivo
[ "$round" = "rint"
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INVALID+FE_EX_CVT_R
|
LDR ip,=FE_EX_FN_CVT+FE_EX_OUTTYPE_LONGLONG+FE_EX_INTYPE_FLOAT+FE_EX_INVALID
]
MOV a4,a1,LSL #1
CMP a4,#&FF000000
ORRHI ip,ip,#FE_EX_CVT_NAN
B __fpl_exception
MEND
_ll_sfrom_f FltRound
; Extra complication because of callee narrowing
llrintf
STMFD sp!,{a1-a2}
LDFD f0,[sp],#8
STFS f0,[sp,#-4]!
LDR a1,[sp],#4
; fall through
_ll_sfrom_f_r FltRound rint
llroundf
STMFD sp!,{a1-a2}
LDFD f0,[sp],#8
STFS f0,[sp,#-4]!
LDR a1,[sp],#4
; fall through
_ll_sfrom_f_rnd FltRound round
; FP support code.
; __fpl_exception receives all exception-generating results. This includes
; all inexact results, so it is responsible for rounding.
;
; ip on entry tells it what to do, and consists of the FE_EX_xxx flags
__fpl_exception
TST ip,#FE_EX_OUTTYPE_DOUBLE
MOVEQ a3,a2
STMDB sp!,{a1-a4}
RFS a2
BIC a4,ip,a2,LSL #10 ; BIC out enabled exceptions
ANDS a4,a4,#FE_EX_UNDERFLOW+FE_EX_OVERFLOW
ORRNE ip,ip,#FE_EX_INEXACT
MOV a4,ip,LSL #1
MOV a4,a4,LSR #27 ; move exceptions down to bottom
ORR a2,a2,a4 ; OR them into cumulative FPSR bits
AND a4,a2,#&100 ; extract ND bit
ORR ip,ip,a4,LSL #16 ; pop it in our word
WFS a2
AND a4,ip,#FE_EX_FN_MASK
TEQ a4,#FE_EX_FN_CVT
ANDNE a4,ip,#FE_EX_TYPE_MASK
ORRNE ip,ip,a4,LSL #3
MOVEQ a4,#FE_EX_CVT_R
BICEQS a4,a4,ip
ORREQ ip,ip,#FE_EX_ROUND_ZERO
; If we actually had trap handlers we should worry about
; FE_EX_CVT_RND here (eg have FE_EX_ROUND_NEAREST)
TST ip,#FE_EX_UNDERFLOW
BNE underflow
TST ip,#FE_EX_OVERFLOW
BNE overflow
TST ip,#FE_EX_INEXACT
BNE inexact
TST ip,#FE_EX_DIVBYZERO
BNE divide_by_zero
; invalid
TST a2,#&00010000 ; IOE bit
LDMIA sp!,{a1-a4}
BEQ return_NaN
B _fp_trapveneer
overflow
TST a2,#&00040000 ; OFE bit
LDMIA sp!,{a1-a4}
BEQ ovf_return
B _fp_trapveneer
underflow
TST a2,#&00080000 ; UFE bit
LDMIA sp!,{a1-a4}
BEQ return_result
B _fp_trapveneer
divide_by_zero
TST a2,#&00020000 ; DZE bit
LDMIA sp!,{a1-a4}
BNE _fp_trapveneer
EOR a3,a1,a3
B return_Inf
inexact
TST a2,#&00100000 ; IXE bit
LDMIA sp!,{a1-a4}
BEQ return_result
B _fp_trapveneer
return_result
TST ip,#FE_EX_OUTTYPE_DOUBLE
Return ,,LinkNotStacked
ovf_return
AND a3,a1,#&80000000
return_Inf
AND a3,a3,#&80000000
TST ip,#FE_EX_OUTTYPE_DOUBLE
ADRNE a1,prototype_double_Inf
LDMNEIA a1,{a1,a2}
ORRNE a1,a1,a3
LDREQ a1,prototype_single_Inf
ORREQ a1,a1,a3
Return ,,LinkNotStacked
return_NaN
AND a3,a1,#&80000000
TST ip,#FE_EX_OUTTYPE_DOUBLE
ADRNE a1,prototype_double_NaN
LDMNEIA a1,{a1,a2}
ORRNE a1,a1,a3
LDREQ a1,prototype_single_NaN
ORREQ a1,a1,a3
B __fpl_return_NaN
prototype_double_Inf
DCD &7FF00000,&00000000
DCD &7FEFFFFF,&FFFFFFFF
prototype_single_Inf
DCD &7F800000
DCD &7F7FFFFF
prototype_double_NaN
DCD &7FF80000,&00000000
prototype_single_NaN
DCD &7FC00000
__fpl_return_NaN
AND a4,ip,#FE_EX_FN_MASK
; CMP a4,#FE_EX_FN_CMP
; MOVEQ a1,#8
; BEQ __fpl_cmpreturn
CMP a4,#FE_EX_FN_CVT
ANDEQ a4,ip,#FE_EX_OUTTYPE_INT
TEQEQ a4,#FE_EX_OUTTYPE_INT
Return ,,LinkNotStacked,NE
TST ip,#FE_EX_CVT_NAN
BNE return_zero
TST ip,#FE_EX_OUTTYPE_UNSIGNED
BNE return_umaxint
TST ip,#FE_EX_OUTTYPE_DOUBLE ; long long?
MOV a3,a1
MVNEQ a1,#&80000000
MVNNE a2,#&80000000
MVNNE a1,#0
TST a3,#&80000000
MVNNE a1,a1
MVNNE a2,a2
Return ,,LinkNotStacked
return_zero
MOV a1,#0
MOV a2,#0
Return ,,LinkNotStacked
return_umaxint
MVN a1,#0
MVN a2,#0
TST a3,#&80000000
MVNNE a1,a1
MVNNE a2,a2
Return ,,LinkNotStacked
IMPORT feraiseexcept
_fp_trapveneer
; This would be a bit backwards for some people, but it works for us...
; we know the relevant traps are enabled, so feraiseexcept won't
; return. Honest...
MOV a1,ip,LSR #26
AND a1,a1,#&1F
B feraiseexcept
END
// Generate test data for long long support
// Run this on a known good C library
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
uint64_t rand63(void)
{
// Generate a 63-bit random number
uint32_t a, b;
a = rand();
b = rand();
return 0x4000000000000000 | ((uint64_t) a << 31) | b;
}
int main(void)
{
// We want to create pairs of random 64-bit numbers with
// every combination of 1-64 leading 0s and 1-64 leading 1s
assert(RAND_MAX == 0x7FFFFFFF);
srand(42);
bool invertb = false;
do
{
bool inverta = false;
do
{
for (size_t bbits = 1; bbits <= 64; bbits++)
for (size_t abits = 1; abits <= 64; abits++)
{
uint64_t a = rand63() >> (abits - 1);
uint64_t b = rand63() >> (bbits - 1);
if (inverta) a = ~a;
if (invertb) b = ~b;
printf("%016llX %016llX\n", a, b);
}
}
while (!inverta++);
}
while (!invertb++);
return 0;
}
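// Typical workflow (assumed, beyond what the sources state): build and run
// this generator with its output redirected to a file named "inputs", which
// is the file the test program below opens; then diff the test program's
// output from a known-good C library against output from the new code, e.g.
//   cc -o gendata gendata.c && ./gendata > inputs
// (the source file name "gendata.c" is just a placeholder).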
// Test long long support
#include <stdio.h>
#include <stdint.h>
struct udivrem { uint64_t q; uint64_t r; };
struct divrem { int64_t q; int64_t r; };
extern int64_t _ll_from_u(uint32_t);
extern int64_t _ll_from_l( int32_t);
extern uint32_t _ll_to_l ( int64_t);
extern int64_t _ll_add ( int64_t, int64_t);
extern int64_t _ll_addlu( int64_t, uint32_t);
extern int64_t _ll_addls( int64_t, int32_t);
extern int64_t _ll_adduu(uint32_t, uint32_t);
extern int64_t _ll_addss( int32_t, int32_t);
extern int64_t _ll_sub ( int64_t, int64_t);
extern int64_t _ll_sublu( int64_t, uint32_t);
extern int64_t _ll_subls( int64_t, int32_t);
extern int64_t _ll_subuu(uint32_t, uint32_t);
extern int64_t _ll_subss( int32_t, int32_t);
extern int64_t _ll_rsb ( int64_t, int64_t);
extern int64_t _ll_rsblu( int64_t, uint32_t);
extern int64_t _ll_rsbls( int64_t, int32_t);
extern int64_t _ll_rsbuu(uint32_t, uint32_t);
extern int64_t _ll_rsbss( int32_t, int32_t);
extern int64_t _ll_mul ( int64_t, int64_t);
extern int64_t _ll_mullu( int64_t, uint32_t);
extern int64_t _ll_mulls( int64_t, int32_t);
extern int64_t _ll_muluu(uint32_t, uint32_t);
extern int64_t _ll_mulss( int32_t, int32_t);
extern __value_in_regs struct udivrem _ll_udiv (uint64_t, uint64_t);
extern __value_in_regs struct udivrem _ll_urdv (uint64_t, uint64_t);
extern __value_in_regs struct udivrem _ll_udiv10(uint64_t);
extern __value_in_regs struct divrem _ll_sdiv ( int64_t, int64_t);
extern __value_in_regs struct divrem _ll_srdv ( int64_t, int64_t);
extern __value_in_regs struct divrem _ll_sdiv10( int64_t);
extern int64_t _ll_not( int64_t);
extern int64_t _ll_neg( int64_t);
extern int64_t _ll_and( int64_t, int64_t);
extern int64_t _ll_or ( int64_t, int64_t);
extern int64_t _ll_eor( int64_t, int64_t);
extern uint64_t _ll_shift_l (uint64_t, uint32_t);
extern uint64_t _ll_ushift_r(uint64_t, uint32_t);
extern int64_t _ll_sshift_r( int64_t, uint32_t);
int main(void)
{
FILE *f = fopen("inputs", "r");
while (!feof(f))
{
int64_t a,b;
fscanf(f, "%016llX %016llX\n", &a, &b);
struct udivrem udr;
struct divrem dr;
printf("%016llX %016llX "
"%016llX %016llX %08X "
"%016llX %016llX %016llX %016llX %016llX "
"%016llX %016llX %016llX %016llX %016llX "
"%016llX %016llX %016llX %016llX %016llX "
"%016llX %016llX %016llX %016llX %016llX ",
a, b,
_ll_from_u(a), _ll_from_l(a), _ll_to_l(a),
_ll_add(a,b), _ll_addlu(a,b), _ll_addls(a,b), _ll_adduu(a,b), _ll_addss(a,b),
_ll_sub(a,b), _ll_sublu(a,b), _ll_subls(a,b), _ll_subuu(a,b), _ll_subss(a,b),
_ll_rsb(a,b), _ll_rsblu(a,b), _ll_rsbls(a,b), _ll_rsbuu(a,b), _ll_rsbss(a,b),
_ll_mul(a,b), _ll_mullu(a,b), _ll_mulls(a,b), _ll_muluu(a,b), _ll_mulss(a,b));
udr = b == 0 ? (struct udivrem) { 0, 0 } : _ll_udiv(a,b);
printf("%016llX %016llX ", udr.q, udr.r);
udr = a == 0 ? (struct udivrem) { 0, 0 } : _ll_urdv(a,b);
printf("%016llX %016llX ", udr.q, udr.r);
udr = _ll_udiv10(a);
printf("%016llX %016llX ", udr.q, udr.r);
dr = b == 0 ? (struct divrem) { 0, 0 } : _ll_sdiv(a,b);
printf("%016llX %016llX ", dr.q, dr.r);
dr = a == 0 ? (struct divrem) { 0, 0 } : _ll_srdv(a,b);
printf("%016llX %016llX ", dr.q, dr.r);
dr = _ll_sdiv10(a);
printf("%016llX %016llX ", dr.q, dr.r);
printf("%016llX %016llX %016llX %016llX %016llX "
"%016llX %016llX %016llX\n",
_ll_not(a), _ll_neg(a), _ll_and(a,b), _ll_or(a,b), _ll_eor(a,b),
_ll_shift_l(a,b&0x3F), _ll_ushift_r(a,b&0x3F), _ll_sshift_r(a,b&0x3F));
}
fclose(f);
return 0;
}