Commit 377852c3 authored by Jeffrey Lee
Browse files

Prefer CPS over MSR for PSR manipulation. Use UDIV/SDIV for general-purpose...

Prefer CPS over MSR for PSR manipulation. Use UDIV/SDIV for general-purpose division, and UMULL for /10.

Detail:
  s/k_body:
  - _kernel_irqs_off and _kernel_irqs_on now use CPS for disabling/enabling IRQs as opposed to MSR. Apart from being shorter code sequences, it's generally a faster instruction.
  - __rt_udiv and __rt_sdiv (and aliases) now use the UDIV and SDIV instructions if building for ARMv7VE
  - __rt_udiv10 and __rt_sdiv10 (and aliases) now use UMULL to multiply by 1/10 when building for targets with long multiply support, as this is faster than the old method. UDIV/SDIV can be fast too, but only for small numbers, making UMULL the best for the general case.
Admin:
  Tested on Cortex-A15
  Prototyping of division routines on assorted CPUs shows that UDIV/SDIV is generally between 20% and 400% faster than the old routine (Cortex-A7, Cortex-A53), or up to 1300% faster on Cortex-A15 (the CPU does not like the old routine!)
  Division by 10 is now about 20% faster across all appropriate CPUs


Version 5.89. Tagged as 'RISC_OSLib-5_89'
parent 90c0b2a6
......@@ -11,13 +11,13 @@
GBLS Module_HelpVersion
GBLS Module_ComponentName
GBLS Module_ComponentPath
Module_MajorVersion SETS "5.88"
Module_Version SETA 588
Module_MajorVersion SETS "5.89"
Module_Version SETA 589
Module_MinorVersion SETS ""
Module_Date SETS "29 Feb 2016"
Module_ApplicationDate SETS "29-Feb-16"
Module_Date SETS "08 May 2016"
Module_ApplicationDate SETS "08-May-16"
Module_ComponentName SETS "RISC_OSLib"
Module_ComponentPath SETS "castle/RiscOS/Sources/Lib/RISC_OSLib"
Module_FullVersion SETS "5.88"
Module_HelpVersion SETS "5.88 (29 Feb 2016)"
Module_FullVersion SETS "5.89"
Module_HelpVersion SETS "5.89 (08 May 2016)"
END
/* (5.88)
/* (5.89)
*
* This file is automatically maintained by srccommit, do not edit manually.
* Last processed by srccommit version: 1.1.
*
*/
#define Module_MajorVersion_CMHG 5.88
#define Module_MajorVersion_CMHG 5.89
#define Module_MinorVersion_CMHG
#define Module_Date_CMHG 29 Feb 2016
#define Module_Date_CMHG 08 May 2016
#define Module_MajorVersion "5.88"
#define Module_Version 588
#define Module_MajorVersion "5.89"
#define Module_Version 589
#define Module_MinorVersion ""
#define Module_Date "29 Feb 2016"
#define Module_Date "08 May 2016"
#define Module_ApplicationDate "29-Feb-16"
#define Module_ApplicationDate "08-May-16"
#define Module_ComponentName "RISC_OSLib"
#define Module_ComponentPath "castle/RiscOS/Sources/Lib/RISC_OSLib"
#define Module_FullVersion "5.88"
#define Module_HelpVersion "5.88 (29 Feb 2016)"
#define Module_LibraryVersionInfo "5:88"
#define Module_FullVersion "5.89"
#define Module_HelpVersion "5.89 (08 May 2016)"
#define Module_LibraryVersionInfo "5:89"
......@@ -270,21 +270,29 @@ uwb_size # 0
; Enable IRQs, then return to the caller.
; NOTE(review): the entry label lies above this hunk; presumably |_kernel_irqs_on| - confirm.
[ {CONFIG}=26
; 26-bit configuration: return through lr with PSRIBit cleared in the value written to pc
BICS pc, lr, #PSRIBit ; 32-bit OK - in {CONFIG}=26
|
[ NoARMv6
; No ARMv6 instructions available: read-modify-write CPSR, using ip as scratch
MRS ip, CPSR
BIC ip, ip, #PSR32IBit ; clear the same I flag that CPSIE i clears on the other path
MSR CPSR_c, ip ; write back the control field only
|
; ARMv6 or later: CPSIE i enables IRQs in one instruction and needs no scratch register
CPSIE i
]
Return ,LinkNotStacked
]
|_kernel_irqs_off|
        ; Disable IRQs, then return to the caller.
        ; The conditional below is written once: the listing previously carried two
        ; copies of the `[ {CONFIG}=26`, `|` and `]` lines, which left the
        ; conditional assembly with two else markers and unbalanced nesting.
 [ {CONFIG}=26
        ; 26-bit configuration: return through lr with PSRIBit set in the value
        ; written to pc.
        ORRS    pc, lr, #PSRIBit
 |
  [ NoARMv6
        ; No ARMv6 instructions available: read-modify-write CPSR (ip is scratch).
        MRS     ip, CPSR
        ORR     ip, ip, #PSR32IBit      ; set the same I flag that CPSID i sets below
        MSR     CPSR_c, ip              ; write back the control field only
  |
        ; ARMv6 or later: CPSID i disables IRQs in one instruction, no scratch needed.
        CPSID   i
  ]
        Return  ,LinkNotStacked
 ]
|_kernel_processor_mode|
[ {CONFIG}=26
......@@ -3132,7 +3140,7 @@ Sleep
|x$udivide|
; Unsigned divide of a2 by a1: returns quotient in a1, remainder in a2
; Destroys a3 and ip
[ NoARMVE
MOV a3, #0
RSBS ip, a1, a2, LSR #3
BCC u_sh2
......@@ -3180,6 +3188,14 @@ u_sh0 RSBS ip, a1, a2
ADCS a3, a3, a3
BCS u_loop
MOV a1, a3
|
; Hardware-divide path (the alternative to the NoARMVE shift/subtract code).
; In:  a1 = divisor, a2 = dividend
; Out: a1 = quotient, a2 = remainder; a3 is used as scratch
; Long delay on UDIV result makes it faster to divide and then check for error
UDIV a3, a2, a1 ; a3 = a2 / a1 (unsigned), issued before the zero test to hide its latency
TEQ a1, #0 ; was the divisor zero?
BEQ dividebyzero ; yes: leave via the dividebyzero label (defined outside this hunk)
MLS a2, a3, a1, a2 ; remainder = dividend - quotient * divisor
MOV a1, a3 ; hand the quotient back in a1
]
Return ,LinkNotStacked
......@@ -3200,6 +3216,7 @@ u_sh0 RSBS ip, a1, a2
|__rt_udiv10|
|_kernel_udiv10|
[ NoARMM
SUB a2, a1, #10
SUB a1, a1, a1, LSR #2
ADD a1, a1, a1, LSR #4
......@@ -3210,6 +3227,16 @@ u_sh0 RSBS ip, a1, a2
SUBS a2, a2, a3, LSL #1
ADDPL a1, a1, #1
ADDMI a2, a2, #10
|
; Long-multiply path (the alternative to the NoARMM shift/add code).
; In:  a1 = unsigned value
; Out: a1 = value / 10, a2 = value mod 10; a3 and ip are used as scratch
; For small numbers, UDIV would be faster than this, but not enough to make it
; worth dynamically switching between algorithms.
LDR a2, =&CCCCCCCD ; (2^35) / 10, rounded up
UMULL ip, a3, a2, a1 ; a3:ip = a1 * &CCCCCCCD; only the high word (a3) is used
MOV a3, a3, LSR #3 ; Accurate division by 10: a3 = (a1 * &CCCCCCCD) >> 35
SUB a2, a1, a3, LSL #1 ; a2 = value - 2 * quotient
MOV a1, a3 ; hand the quotient back in a1
SUB a2, a2, a3, LSL #3 ; a2 = value - 10 * quotient = remainder
]
Return ,LinkNotStacked
......@@ -3220,6 +3247,7 @@ u_sh0 RSBS ip, a1, a2
; Quotient is truncated (rounded towards zero).
; Sign of remainder = sign of dividend.
; Destroys a3, a4 and ip
[ NoARMVE
; Negates dividend and divisor, then does an unsigned divide; signs
; get sorted out again at the end.
......@@ -3275,6 +3303,13 @@ s_sh0 RSBS ip, a1, a2
EORS a1, a3, a4, ASR #31
ADD a1, a1, a4, LSR #31
RSBCS a2, a2, #0
|
; Hardware-divide path (the alternative to the NoARMVE negate-and-divide code).
; Divides a2 by a1: quotient is returned in a1, remainder in a2; a3 is scratch.
; As in the unsigned routine, the divide is issued before the divisor is tested.
SDIV a3, a2, a1 ; a3 = a2 / a1 (signed; SDIV rounds towards zero)
TEQ a1, #0 ; was the divisor zero?
BEQ dividebyzero ; yes: leave via the dividebyzero label (defined outside this hunk)
MLS a2, a3, a1, a2 ; remainder = dividend - quotient * divisor
MOV a1, a3 ; hand the quotient back in a1
]
Return ,LinkNotStacked
; Signed remainder of a2 by a1: returns remainder in a1
......@@ -3292,6 +3327,7 @@ s_sh0 RSBS ip, a1, a2
|__rt_sdiv10|
|_kernel_sdiv10|
[ NoARMM
MOVS a4, a1
RSBMI a1, a1, #0
SUB a2, a1, #10
......@@ -3307,6 +3343,19 @@ s_sh0 RSBS ip, a1, a2
MOVS a4, a4
RSBMI a1, a1, #0
RSBMI a2, a2, #0
|
; Long-multiply path (the alternative to the NoARMM shift/add code).
; In:  a1 = signed value
; Out: a1 = value / 10, a2 = remainder; a3, a4 and ip are used as scratch
; Works on the magnitude with an unsigned multiply, then restores the sign on both results.
; Using SMULL here would be tricky due to the need to round towards zero
MOVS a4, a1 ; set N from the sign of the input; nothing below alters the flags
LDR a2, =&CCCCCCCD ; (2^35) / 10, rounded up
RSBMI a1, a1, #0 ; negative input: a1 = -a1, so a1 now holds the magnitude
UMULL ip, a3, a2, a1 ; a3:ip = magnitude * &CCCCCCCD; only the high word (a3) is used
MOV a3, a3, LSR #3 ; Accurate division by 10: a3 = (magnitude * &CCCCCCCD) >> 35
SUB a2, a1, a3, LSL #1 ; a2 = magnitude - 2 * quotient
MOV a1, a3 ; quotient of the magnitude
SUB a2, a2, a3, LSL #3 ; a2 = magnitude - 10 * quotient = remainder of the magnitude
RSBMI a1, a1, #0 ; input was negative (N still from MOVS): negate the quotient
RSBMI a2, a2, #0 ; ... and the remainder
]
Return ,LinkNotStacked
EXPORT __rt_div0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment